diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 13fb6a32233fe..1683ea157c784 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1453,6 +1453,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // FADDP custom lowering for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::FADD, VT, Custom); + + if (EnablePartialReduceNodes && Subtarget->hasDotProd()) { + setPartialReduceMLAAction(MVT::v4i32, MVT::v16i8, Legal); + setPartialReduceMLAAction(MVT::v2i32, MVT::v8i8, Legal); + setPartialReduceMLAAction(MVT::v2i64, MVT::v16i8, Custom); + } + } else /* !isNeonAvailable */ { for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) @@ -29518,37 +29525,60 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op, } /// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing -/// of nxv2i64/nxv16i8, we cannot directly lower it to a (u|s)dot. We can +/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can /// however still make use of the dot product instruction by instead -/// accumulating over two steps: nxv16i8 -> nxv4i32 -> nxv2i64. +/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64. 
+/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise +/// the following pattern is emitted: +/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))), +/// ext(EXTRACT_SUBVECTOR(N, NTy/2))) SDValue AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const { + bool Scalable = Op.getValueType().isScalableVector(); + + assert((!Scalable || Subtarget->isSVEorStreamingSVEAvailable()) && + "SVE or StreamingSVE must be available when using scalable vectors."); + assert((Scalable || Subtarget->hasDotProd()) && + "Dotprod must be available when targeting NEON dot product " + "instructions."); + SDLoc DL(Op); SDValue Acc = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); EVT ResultVT = Op.getValueType(); - assert(ResultVT == MVT::nxv2i64 && LHS.getValueType() == MVT::nxv16i8); - SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, MVT::nxv4i32, - DAG.getConstant(0, DL, MVT::nxv4i32), LHS, RHS); + assert((Scalable && ResultVT == MVT::nxv2i64 && + LHS.getValueType() == MVT::nxv16i8) || + (!Scalable && ResultVT == MVT::v2i64 && + LHS.getValueType() == MVT::v16i8)); + + EVT DotVT = Scalable ? MVT::nxv4i32 : MVT::v4i32; + SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT, + DAG.getConstant(0, DL, DotVT), LHS, RHS); bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA; - if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) { + if (Scalable && + (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) { unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB; unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT; SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode); return DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode); } - unsigned LoOpcode = IsUnsigned ? AArch64ISD::UUNPKLO : AArch64ISD::SUNPKLO; - unsigned HiOpcode = IsUnsigned ? 
AArch64ISD::UUNPKHI : AArch64ISD::SUNPKHI; - auto Lo = DAG.getNode(LoOpcode, DL, ResultVT, DotNode); - auto Hi = DAG.getNode(HiOpcode, DL, ResultVT, DotNode); - auto Extended = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, Hi); - return DAG.getNode(ISD::ADD, DL, ResultVT, Acc, Extended); + // Fold (nx)v4i32 into (nx)v2i64 + auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL); + if (IsUnsigned) { + DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT); + DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT); + } else { + DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT); + DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT); + } + auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo); + return DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi); } SDValue diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b02a907f7439f..5cc6a38d55977 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1474,6 +1474,17 @@ defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", AArch64sdot>; defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", AArch64udot>; } +let Predicates = [HasNEON, HasDotProd] in { + def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$Acc), (v16i8 V128:$MulLHS), (v16i8 V128:$MulRHS))), + (v4i32 (UDOTv16i8 V128:$Acc, V128:$MulLHS, V128:$MulRHS))>; + def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$Acc), (v16i8 V128:$MulLHS), (v16i8 V128:$MulRHS))), + (v4i32 (SDOTv16i8 V128:$Acc, V128:$MulLHS, V128:$MulRHS))>; + def : Pat<(v2i32 (partial_reduce_umla (v2i32 V64:$Acc), (v8i8 V64:$MulLHS), (v8i8 V64:$MulRHS))), + (v2i32 (UDOTv8i8 V64:$Acc, V64:$MulLHS, V64:$MulRHS))>; + def : Pat<(v2i32 (partial_reduce_smla (v2i32 V64:$Acc), (v8i8 V64:$MulLHS), (v8i8 V64:$MulRHS))), + (v2i32 (SDOTv8i8 V64:$Acc, V64:$MulLHS, V64:$MulRHS))>; +} // End HasNEON, HasDotProd + // ARMv8.6-A BFloat let Predicates = [HasNEON, HasBF16] in 
{ defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">; diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index ab9813aa796e3..2b68c963ad319 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM ; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM -; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NEWLOWERING-I8MM define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-DOT-LABEL: udot: @@ -178,6 +178,18 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-I8MM: // %bb.0: ; CHECK-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: usdot: +; CHECK-NEWLOWERING-I8MM: // %bb.0: +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v4.4h, v3.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-NEWLOWERING-I8MM-NEXT: ret %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 x i32> %mult = mul nuw 
nsw <16 x i32> %s.wide, %u.wide @@ -224,6 +236,29 @@ define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){ ; CHECK-I8MM-NEXT: b.ne .LBB6_1 ; CHECK-I8MM-NEXT: // %bb.2: // %end ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: usdot_in_loop: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr +; CHECK-NEWLOWERING-I8MM-NEXT: .LBB6_1: // %vector.body +; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #16 +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h +; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB6_1 +; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: br label %vector.body @@ -268,6 +303,23 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-I8MM: // %bb.0: ; CHECK-I8MM-NEXT: usdot v0.2s, v1.8b, v2.8b ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: usdot_narrow: +; CHECK-NEWLOWERING-I8MM: // %bb.0: +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEWLOWERING-I8MM-NEXT: smull v3.4s, v2.4h, v1.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NEWLOWERING-I8MM-NEXT: ext v4.16b, 
v1.16b, v1.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h +; CHECK-NEWLOWERING-I8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-NEWLOWERING-I8MM-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEWLOWERING-I8MM-NEXT: ret %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = sext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide @@ -292,6 +344,18 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{ ; CHECK-I8MM: // %bb.0: ; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: sudot: +; CHECK-NEWLOWERING-I8MM: // %bb.0: +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v4.4h, v3.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-NEWLOWERING-I8MM-NEXT: ret %s.wide = sext <16 x i8> %u to <16 x i32> %u.wide = zext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide @@ -338,6 +402,29 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ ; CHECK-I8MM-NEXT: b.ne .LBB9_1 ; CHECK-I8MM-NEXT: // %bb.2: // %end ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: sudot_in_loop: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr +; CHECK-NEWLOWERING-I8MM-NEXT: .LBB9_1: // %vector.body +; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; 
CHECK-NEWLOWERING-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #16 +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h +; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB9_1 +; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: br label %vector.body @@ -382,6 +469,23 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-I8MM: // %bb.0: ; CHECK-I8MM-NEXT: usdot v0.2s, v2.8b, v1.8b ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: sudot_narrow: +; CHECK-NEWLOWERING-I8MM: // %bb.0: +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEWLOWERING-I8MM-NEXT: smull v3.4s, v2.4h, v1.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NEWLOWERING-I8MM-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h +; CHECK-NEWLOWERING-I8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEWLOWERING-I8MM-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-NEWLOWERING-I8MM-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEWLOWERING-I8MM-NEXT: ret %u.wide = sext <8 x i8> %u to <8 x i32> %s.wide = zext 
<8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide @@ -390,14 +494,6 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ } define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { -; CHECK-DOT-LABEL: udot_8to64: -; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 -; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b -; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot_8to64: ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b @@ -415,6 +511,22 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret +; +; CHECK-I8MM-LABEL: udot_8to64: +; CHECK-I8MM: // %bb.0: // %entry +; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b +; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s +; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: udot_8to64: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEWLOWERING-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: %a.wide = zext <16 x i8> %a to <16 x i64> %b.wide = zext <16 x i8> %b to <16 x i64> @@ -425,14 +537,6 @@ entry: } define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ -; CHECK-DOT-LABEL: sdot_8to64: -; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 -; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b -; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-DOT-NEXT: ret -; ; 
CHECK-NODOT-LABEL: sdot_8to64: ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b @@ -450,6 +554,22 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret +; +; CHECK-I8MM-LABEL: sdot_8to64: +; CHECK-I8MM: // %bb.0: // %entry +; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b +; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s +; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: sdot_8to64: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEWLOWERING-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b +; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: %a.wide = sext <16 x i8> %a to <16 x i64> %b.wide = sext <16 x i8> %b to <16 x i64> @@ -491,6 +611,30 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s ; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: usdot_8to64: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v6.4s, v4.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v7.4s, v2.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v16.4s, v5.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v17.4s, v3.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v5.4s, v5.8h, #0 +; 
CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.2d, v2.2s, v3.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: %a.wide = zext <16 x i8> %a to <16 x i64> %b.wide = sext <16 x i8> %b to <16 x i64> @@ -532,6 +676,30 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s ; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: sudot_8to64: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v6.4s, v4.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v7.4s, v2.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v16.4s, v5.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.2d, v2.2s, v3.2s +; 
CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: %a.wide = sext <16 x i8> %a to <16 x i64> %b.wide = zext <16 x i8> %b to <16 x i64> @@ -563,22 +731,6 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ } define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ -; CHECK-DOT-LABEL: udot_no_bin_op_in_loop: -; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 -; CHECK-DOT-NEXT: movi v2.16b, #1 -; CHECK-DOT-NEXT: mov x8, xzr -; CHECK-DOT-NEXT: .LBB16_1: // %vector.body -; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-DOT-NEXT: ldr q3, [x0, x8] -; CHECK-DOT-NEXT: mov v0.16b, v1.16b -; CHECK-DOT-NEXT: add x8, x8, #16 -; CHECK-DOT-NEXT: cmp x8, #16 -; CHECK-DOT-NEXT: udot v1.4s, v3.16b, v2.16b -; CHECK-DOT-NEXT: b.ne .LBB16_1 -; CHECK-DOT-NEXT: // %bb.2: // %end -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop: ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 @@ -598,7 +750,52 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ ; CHECK-NODOT-NEXT: b.ne .LBB16_1 ; CHECK-NODOT-NEXT: // %bb.2: // %end ; CHECK-NODOT-NEXT: ret - +; +; CHECK-I8MM-LABEL: udot_no_bin_op_in_loop: +; CHECK-I8MM: // %bb.0: // %entry +; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-I8MM-NEXT: movi v2.16b, #1 +; CHECK-I8MM-NEXT: mov x8, xzr +; CHECK-I8MM-NEXT: .LBB16_1: // %vector.body +; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-I8MM-NEXT: ldr q3, [x0, x8] +; CHECK-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-I8MM-NEXT: add x8, x8, #16 +; CHECK-I8MM-NEXT: cmp x8, #16 +; CHECK-I8MM-NEXT: udot v1.4s, v3.16b, v2.16b +; CHECK-I8MM-NEXT: b.ne .LBB16_1 +; CHECK-I8MM-NEXT: // %bb.2: // %end +; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_in_loop: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: adrp x8, 
.LCPI16_0 +; CHECK-NEWLOWERING-I8MM-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEWLOWERING-I8MM-NEXT: adrp x9, .LCPI16_2 +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEWLOWERING-I8MM-NEXT: adrp x8, .LCPI16_1 +; CHECK-NEWLOWERING-I8MM-NEXT: adrp x10, .LCPI16_3 +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x8, :lo12:.LCPI16_1] +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q4, [x9, :lo12:.LCPI16_2] +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q5, [x10, :lo12:.LCPI16_3] +; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr +; CHECK-NEWLOWERING-I8MM-NEXT: .LBB16_1: // %vector.body +; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q6, [x0, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: mov v0.16b, v2.16b +; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 +; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #16 +; CHECK-NEWLOWERING-I8MM-NEXT: tbl v7.16b, { v6.16b }, v3.16b +; CHECK-NEWLOWERING-I8MM-NEXT: tbl v16.16b, { v6.16b }, v4.16b +; CHECK-NEWLOWERING-I8MM-NEXT: tbl v17.16b, { v6.16b }, v5.16b +; CHECK-NEWLOWERING-I8MM-NEXT: tbl v6.16b, { v6.16b }, v1.16b +; CHECK-NEWLOWERING-I8MM-NEXT: add v2.4s, v2.4s, v17.4s +; CHECK-NEWLOWERING-I8MM-NEXT: add v7.4s, v16.4s, v7.4s +; CHECK-NEWLOWERING-I8MM-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEWLOWERING-I8MM-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB16_1 +; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: br label %vector.body @@ -691,15 +888,6 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ } define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ -; CHECK-DOT-LABEL: udot_no_bin_op_8to64: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: movi v3.16b, #1 -; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 -; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b -; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot_no_bin_op_8to64: ; 
CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0 @@ -717,21 +905,39 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s ; CHECK-NODOT-NEXT: ret +; +; CHECK-I8MM-LABEL: udot_no_bin_op_8to64: +; CHECK-I8MM: // %bb.0: +; CHECK-I8MM-NEXT: movi v3.16b, #1 +; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b +; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s +; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_8to64: +; CHECK-NEWLOWERING-I8MM: // %bb.0: +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v3.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v4.4s, v3.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v5.4s, v2.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v1.2d, v1.2d, v5.2s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v1.2d, v1.2d, v5.4s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v1.2d, v1.2d, v2.2s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v3.2s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v1.2d, v1.2d, v2.4s +; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v3.4s +; CHECK-NEWLOWERING-I8MM-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) ret <4 x i64> %partial.reduce } define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ -; CHECK-DOT-LABEL: sdot_no_bin_op_8to64: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: movi v3.16b, #1 -; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 -; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, 
v3.16b -; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0 @@ -749,6 +955,33 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s ; CHECK-NODOT-NEXT: ret +; +; CHECK-I8MM-LABEL: sdot_no_bin_op_8to64: +; CHECK-I8MM: // %bb.0: +; CHECK-I8MM-NEXT: movi v3.16b, #1 +; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b +; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s +; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: sdot_no_bin_op_8to64: +; CHECK-NEWLOWERING-I8MM: // %bb.0: +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v3.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v4.4s, v3.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v5.4s, v2.4h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: saddw v1.2d, v1.2d, v5.2s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v1.2d, v1.2d, v5.4s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw v1.2d, v1.2d, v2.2s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v3.2s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v1.2d, v1.2d, v2.4s +; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v3.4s +; CHECK-NEWLOWERING-I8MM-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) ret <4 x i64> %partial.reduce @@ -920,6 +1153,37 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { ; 
CHECK-I8MM-NEXT: // %bb.2: // %end ; CHECK-I8MM-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-I8MM-NEXT: ret +; +; CHECK-NEWLOWERING-I8MM-LABEL: usdot_multiple_zext_users: +; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry +; CHECK-NEWLOWERING-I8MM-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEWLOWERING-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr +; CHECK-NEWLOWERING-I8MM-NEXT: .LBB28_1: // %vector.body +; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: ldr q4, [x2, x8] +; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll v6.8h, v4.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: ushll2 v4.8h, v4.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #1024 +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v5.4h, v6.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.4s, v7.4h, v6.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.4s, v5.8h, v6.8h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.4s, v7.8h, v6.8h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v0.4s, v2.4h, v4.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal v1.4s, v3.4h, v4.4h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v0.4s, v2.8h, v4.8h +; CHECK-NEWLOWERING-I8MM-NEXT: smlal2 v1.4s, v3.8h, v4.8h +; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB28_1 +; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end +; CHECK-NEWLOWERING-I8MM-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEWLOWERING-I8MM-NEXT: ret entry: br label %vector.body @@ -948,3 +1212,20 @@ end: %2 = add <4 x i32> %psum2, %psum1 ret <4 x i32> %2 } + +define <2 x i64> @udot_16to64(<2 x i64> %acc, <8 x i16> %input){ +; CHECK-LABEL: udot_16to64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll 
v2.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s +; CHECK-NEXT: uaddw2 v0.2d, v0.2d, v2.4s +; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-NEXT: uaddw2 v0.2d, v0.2d, v1.4s +; CHECK-NEXT: ret +entry: + %input.wide = zext <8 x i16> %input to <8 x i64> + %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide) + ret <2 x i64> %partial.reduce +} + diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 5bc9a101b1e44..809a45045b0db 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -202,10 +202,10 @@ define @udot_8to64( %acc, @sdot_8to64( %acc,