Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 94bf348

Browse files
JamesChesterman authored and NickGuy-Arm committed
[AArch64][SVE] Add lowering for PARTIAL_REDUCE_U/SMLA to USDOT
Add lowering for PARTIAL_REDUCE_U/SMLA nodes to USDOT instructions. This applies when the second operand of the ISD node is a MUL whose two operands are extended with different signedness (one sign-extend, one zero-extend).
1 parent 5b0cd17 commit 94bf348
Copy full SHA for 94bf348

File tree

4 files changed

+109
-146
lines changed
Filter options

4 files changed

+109
-146
lines changed

‎llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp

Copy file name to clipboardExpand all lines: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+13-2Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -924,8 +924,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
924924
/// illegal ResNo in that case.
925925
bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
926926
// See if the target wants to custom lower this node.
927-
if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
928-
return false;
927+
unsigned Opcode = N->getOpcode();
928+
bool IsPRMLAOpcode =
929+
Opcode == ISD::PARTIAL_REDUCE_UMLA || Opcode == ISD::PARTIAL_REDUCE_SMLA;
930+
931+
if (IsPRMLAOpcode) {
932+
if (TLI.getPartialReduceMLAAction(N->getValueType(0),
933+
N->getOperand(1).getValueType()) !=
934+
TargetLowering::Custom)
935+
return false;
936+
} else {
937+
if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
938+
return false;
939+
}
929940

930941
SmallVector<SDValue, 8> Results;
931942
if (LegalizeResult)

‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Copy file name to clipboardExpand all lines: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+81Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7756,6 +7756,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
77567756
return LowerFLDEXP(Op, DAG);
77577757
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
77587758
return LowerVECTOR_HISTOGRAM(Op, DAG);
7759+
case ISD::PARTIAL_REDUCE_UMLA:
7760+
case ISD::PARTIAL_REDUCE_SMLA:
7761+
return LowerPARTIAL_REDUCE_MLA(Op, DAG);
77597762
}
77607763
}
77617764

@@ -27560,6 +27563,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
2756027563
if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
2756127564
Results.push_back(Res);
2756227565
return;
27566+
case ISD::PARTIAL_REDUCE_UMLA:
27567+
case ISD::PARTIAL_REDUCE_SMLA:
27568+
Results.push_back(LowerPARTIAL_REDUCE_MLA(SDValue(N, 0), DAG));
27569+
return;
2756327570
case ISD::ADD:
2756427571
case ISD::FADD:
2756527572
ReplaceAddWithADDP(N, Results, DAG, Subtarget);
@@ -29506,6 +29513,80 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
2950629513
return Scatter;
2950729514
}
2950829515

29516+
// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(MulOpLHS), SEXT(MulOpRHS)), Splat 1)
29517+
// to USDOT(Acc, MulOpLHS, MulOpRHS)
29518+
// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(MulOpLHS), ZEXT(MulOpRHS)), Splat 1)
29519+
// to USDOT(Acc, MulOpRHS, MulOpLHS)
29520+
SDValue
29521+
AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
29522+
SelectionDAG &DAG) const {
29523+
bool Scalable = Op.getValueType().isScalableVector();
29524+
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29525+
if (Scalable && !Subtarget.isSVEorStreamingSVEAvailable())
29526+
return SDValue();
29527+
if (!Scalable && (!Subtarget.isNeonAvailable() || !Subtarget.hasDotProd()))
29528+
return SDValue();
29529+
if (!Subtarget.hasMatMulInt8())
29530+
return SDValue();
29531+
SDLoc DL(Op);
29532+
29533+
if (Op.getOperand(1).getOpcode() != ISD::MUL)
29534+
return SDValue();
29535+
29536+
SDValue Acc = Op.getOperand(0);
29537+
SDValue Mul = Op.getOperand(1);
29538+
29539+
APInt ConstantOne;
29540+
if (!ISD::isConstantSplatVector(Op.getOperand(2).getNode(), ConstantOne) ||
29541+
!ConstantOne.isOne())
29542+
return SDValue();
29543+
29544+
SDValue ExtMulOpLHS = Mul.getOperand(0);
29545+
SDValue ExtMulOpRHS = Mul.getOperand(1);
29546+
unsigned ExtMulOpLHSOpcode = ExtMulOpLHS.getOpcode();
29547+
unsigned ExtMulOpRHSOpcode = ExtMulOpRHS.getOpcode();
29548+
if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
29549+
!ISD::isExtOpcode(ExtMulOpRHSOpcode))
29550+
return SDValue();
29551+
29552+
SDValue MulOpLHS = ExtMulOpLHS.getOperand(0);
29553+
SDValue MulOpRHS = ExtMulOpRHS.getOperand(0);
29554+
EVT MulOpLHSVT = MulOpLHS.getValueType();
29555+
if (MulOpLHSVT != MulOpRHS.getValueType())
29556+
return SDValue();
29557+
29558+
bool LHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
29559+
bool RHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
29560+
if (LHSIsSigned == RHSIsSigned)
29561+
return SDValue();
29562+
29563+
EVT AccVT = Acc.getValueType();
29564+
// There is no nxv2i64 version of usdot
29565+
if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
29566+
return SDValue();
29567+
29568+
// USDOT expects the signed operand to be last
29569+
if (!RHSIsSigned)
29570+
std::swap(MulOpLHS, MulOpRHS);
29571+
29572+
unsigned Opcode = AArch64ISD::USDOT;
29573+
// Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
29574+
// product followed by a zero / sign extension
29575+
// Don't want this to be split because there is no nxv2i64 version of usdot
29576+
if ((AccVT == MVT::nxv4i64 && MulOpLHSVT == MVT::nxv16i8) ||
29577+
(AccVT == MVT::v4i64 && MulOpLHSVT == MVT::v16i8)) {
29578+
EVT AccVTI32 = (AccVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
29579+
29580+
SDValue DotI32 =
29581+
DAG.getNode(Opcode, DL, AccVTI32, DAG.getConstant(0, DL, AccVTI32),
29582+
MulOpLHS, MulOpRHS);
29583+
SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, AccVT);
29584+
return DAG.getNode(ISD::ADD, DL, AccVT, Acc, Extended);
29585+
}
29586+
29587+
return DAG.getNode(Opcode, DL, AccVT, Acc, MulOpLHS, MulOpRHS);
29588+
}
29589+
2950929590
SDValue
2951029591
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
2951129592
SelectionDAG &DAG) const {

‎llvm/lib/Target/AArch64/AArch64ISelLowering.h

Copy file name to clipboardExpand all lines: llvm/lib/Target/AArch64/AArch64ISelLowering.h
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,6 +1181,7 @@ class AArch64TargetLowering : public TargetLowering {
11811181
SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
11821182
SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
11831183
SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
1184+
SDValue LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
11841185
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
11851186
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
11861187
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;

‎llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Copy file name to clipboardExpand all lines: llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+14-144Lines changed: 14 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -106,25 +106,7 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
106106
;
107107
; CHECK-NEWLOWERING-LABEL: usdot:
108108
; CHECK-NEWLOWERING: // %bb.0: // %entry
109-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
110-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
111-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
112-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
113-
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
114-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
115-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
116-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
117-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
118-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
119-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
120-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
121-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
122-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
123-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
124-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
125-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
126-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
127-
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
109+
; CHECK-NEWLOWERING-NEXT: usdot z0.s, z1.b, z2.b
128110
; CHECK-NEWLOWERING-NEXT: ret
129111
entry:
130112
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -165,25 +147,7 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
165147
;
166148
; CHECK-NEWLOWERING-LABEL: sudot:
167149
; CHECK-NEWLOWERING: // %bb.0: // %entry
168-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
169-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
170-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
171-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
172-
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
173-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
174-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
175-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
176-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
177-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
178-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
179-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
180-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
181-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
182-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
183-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
184-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
185-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
186-
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
150+
; CHECK-NEWLOWERING-NEXT: usdot z0.s, z2.b, z1.b
187151
; CHECK-NEWLOWERING-NEXT: ret
188152
entry:
189153
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -415,59 +379,12 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
415379
;
416380
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
417381
; CHECK-NEWLOWERING: // %bb.0: // %entry
418-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
419-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
420-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
421-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
422-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
423-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
424-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
425-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
426-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
427-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
428-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
429-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
430-
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
431-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
432-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
433-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
434-
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
435-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
436-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
437-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
438-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
439-
; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
440-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
441-
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
442-
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
443-
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
444-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
445-
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
446-
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
447-
; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
448-
; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
449-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
450-
; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
451-
; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
452-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
453-
; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
454-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
455-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
456-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
457-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
458-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
459-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
460-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
461-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
462-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
463-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
464-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
465-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
466-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
467-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
468-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
469-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
470-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
382+
; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
383+
; CHECK-NEWLOWERING-NEXT: usdot z4.s, z2.b, z3.b
384+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
385+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
386+
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
387+
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
471388
; CHECK-NEWLOWERING-NEXT: ret
472389
entry:
473390
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -548,59 +465,12 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
548465
;
549466
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
550467
; CHECK-NEWLOWERING: // %bb.0: // %entry
551-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
552-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
553-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
554-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
555-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
556-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
557-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
558-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
559-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
560-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
561-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
562-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
563-
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
564-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
565-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
566-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
567-
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
568-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
569-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
570-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
571-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
572-
; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
573-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
574-
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
575-
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
576-
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
577-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
578-
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
579-
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
580-
; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
581-
; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
582-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
583-
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
584-
; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
585-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
586-
; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
587-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
588-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
589-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
590-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
591-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
592-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
593-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
594-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
595-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
596-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
597-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
598-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
599-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
600-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
601-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
602-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
603-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
468+
; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
469+
; CHECK-NEWLOWERING-NEXT: usdot z4.s, z3.b, z2.b
470+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
471+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
472+
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
473+
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
604474
; CHECK-NEWLOWERING-NEXT: ret
605475
entry:
606476
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.