-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AArch64] Improve bcvtn2 and remove aarch64_neon_bfcvt intrinsics #120363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,7 @@ | |
#include "llvm/Support/Regex.h" | ||
#include "llvm/TargetParser/Triple.h" | ||
#include <cstring> | ||
#include <numeric> | ||
|
||
using namespace llvm; | ||
|
||
|
@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, | |
return true; | ||
} | ||
} | ||
|
||
// Changed in 20.0: bfcvt/bfcvtn/bcvtn2 have been replaced with fptrunc. | ||
if (Name.starts_with("bfcvt")) { | ||
NewFn = nullptr; | ||
return true; | ||
} | ||
|
||
return false; // No other 'aarch64.neon.*'. | ||
} | ||
if (Name.consume_front("sve.")) { | ||
|
@@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, | |
|
||
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, | ||
Function *F, IRBuilder<> &Builder) { | ||
Intrinsic::ID NewID = | ||
StringSwitch<Intrinsic::ID>(Name) | ||
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2) | ||
.Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2) | ||
.Default(Intrinsic::not_intrinsic); | ||
if (NewID == Intrinsic::not_intrinsic) | ||
llvm_unreachable("Unhandled Intrinsic!"); | ||
|
||
SmallVector<Value *, 3> Args(CI->args()); | ||
|
||
// The original intrinsics incorrectly used a predicate based on the smallest | ||
// element type rather than the largest. | ||
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8); | ||
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4); | ||
|
||
if (Args[1]->getType() != BadPredTy) | ||
llvm_unreachable("Unexpected predicate type!"); | ||
|
||
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, | ||
BadPredTy, Args[1]); | ||
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, | ||
GoodPredTy, Args[1]); | ||
|
||
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr, | ||
CI->getName()); | ||
if (Name.starts_with("neon.bfcvt")) { | ||
if (Name.starts_with("neon.bfcvtn2")) { | ||
SmallVector<int, 32> LoMask(4); | ||
std::iota(LoMask.begin(), LoMask.end(), 0); | ||
SmallVector<int, 32> ConcatMask(8); | ||
std::iota(ConcatMask.begin(), ConcatMask.end(), 0); | ||
Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask); | ||
Value *Trunc = | ||
Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType()); | ||
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask); | ||
} else if (Name.starts_with("neon.bfcvtn")) { | ||
SmallVector<int, 32> ConcatMask(8); | ||
std::iota(ConcatMask.begin(), ConcatMask.end(), 0); | ||
Type *V4BF16 = | ||
FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4); | ||
Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16); | ||
dbgs() << "Trunc: " << *Trunc << "\n"; | ||
return Builder.CreateShuffleVector( | ||
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask); | ||
} else { | ||
return Builder.CreateFPTrunc(CI->getOperand(0), | ||
Type::getBFloatTy(F->getContext())); | ||
} | ||
} else if (Name.starts_with("sve.fcvt")) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Last question about testing: if not mistaken, I don't see SVE test changes. Is it expected not to change codegen for the sve tests, or are we missing some coverage? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The sve.fcvt is what this function (upgradeAArch64IntrinsicCall) was previously handling, so will be NFC in that regard. It gets here because of these lines from upgradeArmOrAarch64IntrinsicFunction.
It's now just inside an if to be more clear. It has its own tests that are still doing OK. |
||
Intrinsic::ID NewID = | ||
StringSwitch<Intrinsic::ID>(Name) | ||
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2) | ||
.Case("sve.fcvtnt.bf16f32", | ||
Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2) | ||
.Default(Intrinsic::not_intrinsic); | ||
if (NewID == Intrinsic::not_intrinsic) | ||
llvm_unreachable("Unhandled Intrinsic!"); | ||
|
||
SmallVector<Value *, 3> Args(CI->args()); | ||
|
||
// The original intrinsics incorrectly used a predicate based on the | ||
// smallest element type rather than the largest. | ||
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8); | ||
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4); | ||
|
||
if (Args[1]->getType() != BadPredTy) | ||
llvm_unreachable("Unexpected predicate type!"); | ||
|
||
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, | ||
BadPredTy, Args[1]); | ||
Args[1] = Builder.CreateIntrinsic( | ||
Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); | ||
|
||
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr, | ||
CI->getName()); | ||
} | ||
|
||
llvm_unreachable("Unhandled Intrinsic!"); | ||
} | ||
|
||
static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm> | |
|
||
let mayRaiseFPException = 1, Uses = [FPCR] in | ||
class SIMD_BFCVTN | ||
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, | ||
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I haven't looked at how this change is used in the base class to be honest, but is my guess correct that due to this change, we see this line disappearing in llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes that sounds right. It is needed to make the patterns match properly, as a bfcvtn will naturally produce a 64bit vector, and v4bf16 is a 64bit vector. Other instructions that use SIMDMixedTwoVector like XTN use the same type. |
||
"bfcvtn", ".4h", ".4s", | ||
[(set (v8bf16 V128:$Rd), | ||
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; | ||
[(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>; | ||
|
||
let mayRaiseFPException = 1, Uses = [FPCR] in | ||
class SIMD_BFCVTN2 | ||
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, | ||
"bfcvtn2", ".8h", ".4s", | ||
[(set (v8bf16 V128:$dst), | ||
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; | ||
"bfcvtn2", ".8h", ".4s", []>; | ||
|
||
let mayRaiseFPException = 1, Uses = [FPCR] in | ||
class BF16ToSinglePrecision<string asm> | ||
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", | ||
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, | ||
[(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>, | ||
Sched<[WriteFCvt]> { | ||
bits<5> Rd; | ||
bits<5> Rn; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure I am following what the result TMP4 represents here: it is an 8 element vector,
where the first 4 elements come from INACTIVE, and the other 4 elements the truncated floats. Is that right? How does that match up with "BFCVTN2 instruction writes the results to the upper half of the destination vector without affecting the other bits in the register"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep - the bfcvtn2 instruction will take the bottom half of the first input (INACTIVE, the bottom half is TMP2), and insert the top half from truncating A and inserting them into the top half. From the compilers point of view the first operand (the "destination" vector) is both an input and an output. TMP4 is the concat of TMP2, with TMP3 now being the upper bits.
I gave it another test and compiling this test with clang still produces the same assembly as before, still producing
bfcvtn2 v0.8h, v1.4s
.