llvm · davemgreen · Jan 21, 2025 · Jan 15, 2025 · sjoerdmeijer · Jan 16, 2025
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
@@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
 def OP_VCVT_F32_BF16_HI
    : Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;

-def OP_VCVT_BF16_F32_LO_A64
-    : Op<(call "__a64_vcvtq_low_bf16", $p0)>;
-def OP_VCVT_BF16_F32_A64
-    : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
-
 def OP_VCVT_BF16_F32_A32
    : Op<(call "__a32_vcvt_bf16", $p0)>;

@@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
 }

 let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
-  def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
-  def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
+  def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
  def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
-  def VCVT_BF16_F32 : SOpInst<"vcvt_bf16",    "BQ", "f", OP_VCVT_BF16_F32_A64>;
+  def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;

  def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
  def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7307,7 +7307,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
 };

 static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
-  NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
@@ -7407,7 +7406,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
-  NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
+  NEONMAP0(vcvtq_high_bf16_f32),
+  NEONMAP0(vcvtq_low_bf16_f32),
  NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -7616,7 +7616,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
  NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
  NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
-  NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
+  NEONMAP0(vcvth_bf16_f32),
  NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
  NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
  NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -12083,6 +12083,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
    return ConstantInt::get(Builder.getInt32Ty(), 0);
  }

+  if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
+    return Builder.CreateFPTrunc(
+        Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
+                              Builder.getFloatTy()),
+        Builder.getBFloatTy());
+
  // Handle MSVC intrinsics before argument evaluation to prevent double
  // evaluation.
  if (std::optional<MSVCIntrin> MsvcIntId =
@@ -12808,6 +12814,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                        "vgetq_lane");
  }
+  case NEON::BI__builtin_neon_vcvt_bf16_f32: {
+    llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+    llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+    return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
+  }
+  case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
+    SmallVector<int, 16> ConcatMask(8);
+    std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+    llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+    llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+    llvm::Value *Trunc =
+        Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
+    return Builder.CreateShuffleVector(
+        Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
+  }
+  case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
+    SmallVector<int, 16> ConcatMask(8);
+    std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+    SmallVector<int, 16> LoMask(4);
+    std::iota(LoMask.begin(), LoMask.end(), 0);
+    llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+    llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+    llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
+    llvm::Value *Inactive = Builder.CreateShuffleVector(
+        Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
+    llvm::Value *Trunc =
+        Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
+    return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
+  }

  case clang::AArch64::BI_InterlockedAdd:
  case clang::AArch64::BI_InterlockedAdd64: {

diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
 // CHECK-A64-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT:    ret <4 x bfloat> [[SHUFFLE_I]]
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT:    ret <4 x bfloat> [[TMP1]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
@@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
 // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP2]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
@@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
-// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT:    [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP4]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
@@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {

 // CHECK-A64-LABEL: @test_vcvth_bf16_f32(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
-// CHECK-A64-NEXT:    ret bfloat [[VCVTH_BF16_F32_I]]
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
+// CHECK-A64-NEXT:    ret bfloat [[TMP0]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
  def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
  def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;

-
-  // v8.6-A Bfloat Intrinsics
-  def int_aarch64_neon_bfcvt
-    : DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
-  def int_aarch64_neon_bfcvtn
-    : DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_aarch64_neon_bfcvtn2
-    : DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
-                [llvm_v8bf16_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-
  // v8.2-A FP16 Fused Multiply-Add Long
  def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
  def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;

diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
@@ -45,6 +45,7 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstring>
+#include <numeric>

 using namespace llvm;

@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
          return true;
        }
      }
+
+      // Changed in 20.0: bfcvt/bfcvtn/bcvtn2 have been replaced with fptrunc.
+      if (Name.starts_with("bfcvt")) {
+        NewFn = nullptr;
+        return true;
+      }
+
      return false; // No other 'aarch64.neon.*'.
    }
    if (Name.consume_front("sve.")) {
@@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,

 static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
                                          Function *F, IRBuilder<> &Builder) {
-  Intrinsic::ID NewID =
-      StringSwitch<Intrinsic::ID>(Name)
-          .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
-          .Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
-          .Default(Intrinsic::not_intrinsic);
-  if (NewID == Intrinsic::not_intrinsic)
-    llvm_unreachable("Unhandled Intrinsic!");
-
-  SmallVector<Value *, 3> Args(CI->args());
-
-  // The original intrinsics incorrectly used a predicate based on the smallest
-  // element type rather than the largest.
-  Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
-  Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
-
-  if (Args[1]->getType() != BadPredTy)
-    llvm_unreachable("Unexpected predicate type!");
-
-  Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
-                                    BadPredTy, Args[1]);
-  Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
-                                    GoodPredTy, Args[1]);
-
-  return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
-                                 CI->getName());
+  if (Name.starts_with("neon.bfcvt")) {
+    if (Name.starts_with("neon.bfcvtn2")) {
+      SmallVector<int, 32> LoMask(4);
+      std::iota(LoMask.begin(), LoMask.end(), 0);
+      SmallVector<int, 32> ConcatMask(8);
+      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+      Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
+      Value *Trunc =
+          Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
+      return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
+    } else if (Name.starts_with("neon.bfcvtn")) {
+      SmallVector<int, 32> ConcatMask(8);
+      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+      Type *V4BF16 =
+          FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
+      Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
+      dbgs() << "Trunc: " << *Trunc << "\n";
+      return Builder.CreateShuffleVector(
+          Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
+    } else {
+      return Builder.CreateFPTrunc(CI->getOperand(0),
+                                   Type::getBFloatTy(F->getContext()));
+    }
+  } else if (Name.starts_with("sve.fcvt")) {
+    Intrinsic::ID NewID =
+        StringSwitch<Intrinsic::ID>(Name)
+            .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
+            .Case("sve.fcvtnt.bf16f32",
+                  Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
+            .Default(Intrinsic::not_intrinsic);
+    if (NewID == Intrinsic::not_intrinsic)
+      llvm_unreachable("Unhandled Intrinsic!");
+
+    SmallVector<Value *, 3> Args(CI->args());
+
+    // The original intrinsics incorrectly used a predicate based on the
+    // smallest element type rather than the largest.
+    Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
+    Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
+
+    if (Args[1]->getType() != BadPredTy)
+      llvm_unreachable("Unexpected predicate type!");
+
+    Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
+                                      BadPredTy, Args[1]);
+    Args[1] = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
+
+    return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
+                                   CI->getName());
+  }
+
+  llvm_unreachable("Unhandled Intrinsic!");
 }

 static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>

 let mayRaiseFPException = 1, Uses = [FPCR] in
 class SIMD_BFCVTN
-  : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
+  : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
                           "bfcvtn", ".4h", ".4s",
-    [(set (v8bf16 V128:$Rd),
-          (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
+    [(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;

 let mayRaiseFPException = 1, Uses = [FPCR] in
 class SIMD_BFCVTN2
  : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
-                           "bfcvtn2", ".8h", ".4s",
-    [(set (v8bf16 V128:$dst),
-          (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
+                               "bfcvtn2", ".8h", ".4s", []>;

 let mayRaiseFPException = 1, Uses = [FPCR] in
 class BF16ToSinglePrecision<string asm>
  : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
-    [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
+    [(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
    Sched<[WriteFCvt]> {
  bits<5> Rd;
  bits<5> Rn;

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1454,8 +1454,8 @@ def BFMLALTIdx   : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
 def BFCVTN       : SIMD_BFCVTN;
 def BFCVTN2      : SIMD_BFCVTN2;

-def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
-          (EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
+def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
+          (BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;

 // Vector-scalar BFDOT:
 // The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1477,8 +1477,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot

 let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
 def BFCVT : BF16ToSinglePrecision<"bfcvt">;
-// Round FP32 to BF16.
-def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
 }

 // ARMv8.6A AArch64 matrix multiplication
@@ -10410,9 +10408,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
  let Predicates = [HasBF16] in
  def : Pat<(InOp (v8bf16 V128:$Rn)),
            (v8bf16 (BFCVTN2
-              (v8bf16 (BFCVTN
-                (v4f32 (OutInst
-                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+              (INSERT_SUBREG (IMPLICIT_DEF),
+                (v4bf16 (BFCVTN
+                  (v4f32 (OutInst
+                    (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+                dsub),
              (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;

  let Predicates = [HasNoBF16] in
@@ -10447,10 +10447,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
  let Predicates = [HasBF16] in
  def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
            (v8bf16 (BFCVTN2
-              (v8bf16 (BFCVTN
-                (v4f32 (OutInst
-                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
-                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+              (INSERT_SUBREG (IMPLICIT_DEF),
+                (v4bf16 (BFCVTN
+                  (v4f32 (OutInst
+                    (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
+                    (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+                dsub),
              (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
                              (v4f32 (SHLLv8i16 V128:$Rm))))))>;


diff --git a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
@@ -1,5 +1,8 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s

+; This test acts to test the old neon.bfcvt intrinsics, which are now
+; autoupgraded to fptrunc operations.
+
 declare bfloat @llvm.aarch64.neon.bfcvt(float)
 declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
 declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)
-Original file line number
+Diff line change
@@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
     let mayRaiseFPException = 1, Uses = [FPCR] in
     class SIMD_BFCVTN
-      : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
+      : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
             Copy link

  
      
    
  

  
      

  
    Collaborator


      

  

  
    
      

      
            sjoerdmeijer
  

      

      

      


        Jan 20, 2025


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      I haven't looked at how this change is used in the base class to be honest, but is my guess correct that due to this change, we see this line disappearing in llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll:
kill: def $d0 killed $d0 killed $q0

    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



  
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
    Collaborator


      

  Author


  

  
    
      

      
            davemgreen
  

      

      

      


        Jan 20, 2025


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Yes that sounds right. It is needed to make the patterns match properly, as a bfcvtn will naturally produce a 64bit vector, and v4bf16 is a 64bit vector. Other instructions that use SIMDMixedTwoVector like XTN use the same type.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
                                "bfcvtn", ".4h", ".4s",
-        [(set (v8bf16 V128:$Rd),
-              (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
+        [(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;
     let mayRaiseFPException = 1, Uses = [FPCR] in
     class SIMD_BFCVTN2
       : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
-                               "bfcvtn2", ".8h", ".4s",
-        [(set (v8bf16 V128:$dst),
-              (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
+                                   "bfcvtn2", ".8h", ".4s", []>;
     let mayRaiseFPException = 1, Uses = [FPCR] in
     class BF16ToSinglePrecision<string asm>
       : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
-        [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
+        [(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
         Sched<[WriteFCvt]> {
       bits<5> Rd;
       bits<5> Rn;
-          Expand Down