Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

[AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics #120363

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions 10 clang/include/clang/Basic/arm_neon.td
Original file line number Diff line number Diff line change
Expand Up @@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
def OP_VCVT_F32_BF16_HI
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;

def OP_VCVT_BF16_F32_LO_A64
: Op<(call "__a64_vcvtq_low_bf16", $p0)>;
def OP_VCVT_BF16_F32_A64
: Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;

def OP_VCVT_BF16_F32_A32
: Op<(call "__a32_vcvt_bf16", $p0)>;

Expand Down Expand Up @@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
}

let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;

def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;
Expand Down
41 changes: 38 additions & 3 deletions 41 clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7307,7 +7307,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
};

static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
NEONMAP0(splat_lane_v),
NEONMAP0(splat_laneq_v),
NEONMAP0(splatq_lane_v),
Expand Down Expand Up @@ -7407,7 +7406,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP0(vcvtq_f16_s16),
NEONMAP0(vcvtq_f16_u16),
NEONMAP0(vcvtq_f32_v),
NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
NEONMAP0(vcvtq_high_bf16_f32),
NEONMAP0(vcvtq_low_bf16_f32),
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
Expand Down Expand Up @@ -7616,7 +7616,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
NEONMAP0(vcvth_bf16_f32),
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
Expand Down Expand Up @@ -12083,6 +12083,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return ConstantInt::get(Builder.getInt32Ty(), 0);
}

if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
return Builder.CreateFPTrunc(
Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
Builder.getFloatTy()),
Builder.getBFloatTy());

// Handle MSVC intrinsics before argument evaluation to prevent double
// evaluation.
if (std::optional<MSVCIntrin> MsvcIntId =
Expand Down Expand Up @@ -12808,6 +12814,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
"vgetq_lane");
}
case NEON::BI__builtin_neon_vcvt_bf16_f32: {
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
}
case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
SmallVector<int, 16> ConcatMask(8);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
llvm::Value *Trunc =
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
return Builder.CreateShuffleVector(
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
}
case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
SmallVector<int, 16> ConcatMask(8);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
SmallVector<int, 16> LoMask(4);
std::iota(LoMask.begin(), LoMask.end(), 0);
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
llvm::Value *Inactive = Builder.CreateShuffleVector(
Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
llvm::Value *Trunc =
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
}

case clang::AArch64::BI_InterlockedAdd:
case clang::AArch64::BI_InterlockedAdd64: {
Expand Down
23 changes: 11 additions & 12 deletions 23 clang/test/CodeGen/arm-bf16-convert-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down Expand Up @@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down Expand Up @@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure I am following what the result TMP4 represents here: it is an 8-element vector,
where the first 4 elements come from INACTIVE and the other 4 elements are the truncated floats. Is that right? How does that match up with "BFCVTN2 instruction writes the results to the upper half of the destination vector without affecting the other bits in the register"?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep - the bfcvtn2 instruction keeps the bottom half of the first input (INACTIVE; its bottom half is TMP2) and writes the result of truncating A into the top half. From the compiler's point of view the first operand (the "destination" vector) is both an input and an output. TMP4 is the concatenation of TMP2 and TMP3, with TMP3 forming the upper half.

I gave it another test and compiling this test with clang still produces the same assembly as before, still producing bfcvtn2 v0.8h, v1.4s.

// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down Expand Up @@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {

// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
// CHECK-A64-NEXT: ret bfloat [[TMP0]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down
11 changes: 0 additions & 11 deletions 11 llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;


// v8.6-A Bfloat Intrinsics
def int_aarch64_neon_bfcvt
: DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
def int_aarch64_neon_bfcvtn
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_aarch64_neon_bfcvtn2
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
[llvm_v8bf16_ty, llvm_v4f32_ty],
[IntrNoMem]>;

// v8.2-A FP16 Fused Multiply-Add Long
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
Expand Down
86 changes: 61 additions & 25 deletions 86 llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstring>
#include <numeric>

using namespace llvm;

Expand Down Expand Up @@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return true;
}
}

// Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
if (Name.starts_with("bfcvt")) {
NewFn = nullptr;
return true;
}

return false; // No other 'aarch64.neon.*'.
}
if (Name.consume_front("sve.")) {
Expand Down Expand Up @@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,

static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
Function *F, IRBuilder<> &Builder) {
Intrinsic::ID NewID =
StringSwitch<Intrinsic::ID>(Name)
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
.Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
.Default(Intrinsic::not_intrinsic);
if (NewID == Intrinsic::not_intrinsic)
llvm_unreachable("Unhandled Intrinsic!");

SmallVector<Value *, 3> Args(CI->args());

// The original intrinsics incorrectly used a predicate based on the smallest
// element type rather than the largest.
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);

if (Args[1]->getType() != BadPredTy)
llvm_unreachable("Unexpected predicate type!");

Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
BadPredTy, Args[1]);
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
GoodPredTy, Args[1]);

return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
CI->getName());
if (Name.starts_with("neon.bfcvt")) {
if (Name.starts_with("neon.bfcvtn2")) {
SmallVector<int, 32> LoMask(4);
std::iota(LoMask.begin(), LoMask.end(), 0);
SmallVector<int, 32> ConcatMask(8);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
Value *Trunc =
Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
} else if (Name.starts_with("neon.bfcvtn")) {
SmallVector<int, 32> ConcatMask(8);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
Type *V4BF16 =
FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
dbgs() << "Trunc: " << *Trunc << "\n";
return Builder.CreateShuffleVector(
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
} else {
return Builder.CreateFPTrunc(CI->getOperand(0),
Type::getBFloatTy(F->getContext()));
}
} else if (Name.starts_with("sve.fcvt")) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Last question about testing: if I'm not mistaken, I don't see SVE test changes. Is it expected not to change codegen for the sve tests, or are we missing some coverage?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sve.fcvt is what this function (upgradeAArch64IntrinsicCall) was previously handling, so will be NFC in that regard. It gets here because of these lines from upgradeArmOrAarch64IntrinsicFunction.

      // 'aarch64.sve.fcvt.bf16f32' || 'aarch64.sve.fcvtnt.bf16f32'
      if (Name == "fcvt.bf16f32" || Name == "fcvtnt.bf16f32") {
        NewFn = nullptr;

It's now just inside an if to be more clear. It has its own tests that are still doing OK.

Intrinsic::ID NewID =
StringSwitch<Intrinsic::ID>(Name)
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
.Case("sve.fcvtnt.bf16f32",
Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
.Default(Intrinsic::not_intrinsic);
if (NewID == Intrinsic::not_intrinsic)
llvm_unreachable("Unhandled Intrinsic!");

SmallVector<Value *, 3> Args(CI->args());

// The original intrinsics incorrectly used a predicate based on the
// smallest element type rather than the largest.
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);

if (Args[1]->getType() != BadPredTy)
llvm_unreachable("Unexpected predicate type!");

Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
BadPredTy, Args[1]);
Args[1] = Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);

return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
CI->getName());
}

llvm_unreachable("Unhandled Intrinsic!");
}

static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Expand Down
11 changes: 4 additions & 7 deletions 11 llvm/lib/Target/AArch64/AArch64InstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>

let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't looked at how this change is used in the base class to be honest, but is my guess correct that due to this change, we see this line disappearing in llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll:

kill: def $d0 killed $d0 killed $q0

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that sounds right. It is needed to make the patterns match properly, as a bfcvtn will naturally produce a 64bit vector, and v4bf16 is a 64bit vector. Other instructions that use SIMDMixedTwoVector like XTN use the same type.

"bfcvtn", ".4h", ".4s",
[(set (v8bf16 V128:$Rd),
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
[(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;

let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN2
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
"bfcvtn2", ".8h", ".4s",
[(set (v8bf16 V128:$dst),
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
"bfcvtn2", ".8h", ".4s", []>;

let mayRaiseFPException = 1, Uses = [FPCR] in
class BF16ToSinglePrecision<string asm>
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
[(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
Expand Down
24 changes: 13 additions & 11 deletions 24 llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1454,8 +1454,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;

def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
(EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
(BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;

// Vector-scalar BFDOT:
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
Expand All @@ -1477,8 +1477,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot

let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
// Round FP32 to BF16.
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
}

// ARMv8.6A AArch64 matrix multiplication
Expand Down Expand Up @@ -10410,9 +10408,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn)),
(v8bf16 (BFCVTN2
(v8bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
(INSERT_SUBREG (IMPLICIT_DEF),
(v4bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;

let Predicates = [HasNoBF16] in
Expand Down Expand Up @@ -10447,10 +10447,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
(v8bf16 (BFCVTN2
(v8bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
(INSERT_SUBREG (IMPLICIT_DEF),
(v4bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
(v4f32 (SHLLv8i16 V128:$Rm))))))>;

Expand Down
3 changes: 3 additions & 0 deletions 3 llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s

; This test acts to test the old neon.bfcvt intrinsics, which are now
; autoupgraded to fptrunc operations.

declare bfloat @llvm.aarch64.neon.bfcvt(float)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)
Expand Down
Loading
Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.