diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 22f23e4c94e2d..8ca36e038419e 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include <optional>
+#include <queue>
 
 using namespace llvm;
 
@@ -35,6 +36,11 @@ using namespace llvm;
 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
 STATISTIC(NumSDWAInstructionsPeepholed,
           "Number of instruction converted to SDWA.");
+STATISTIC(Num16BitPackedInstructionsEliminated,
+          "Number of packed instruction eliminated.");
+STATISTIC(NumSDWAInstructionsToEliminateFP16Pack,
+          "Number of instruction converted/modified into SDWA to eliminate "
+          "FP16 packing.");
 
 namespace {
 
@@ -66,6 +72,14 @@ class SIPeepholeSDWA {
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
+  void eliminateFP16Packing(MachineBasicBlock &MBB, const GCNSubtarget &ST);
+  unsigned
+  computeMIChainsForPackedOps(MachineInstr *ParentMI,
+                              std::queue<MachineOperand *> &DefSrcQueue,
+                              const GCNSubtarget &ST);
+  void convertMIToSDWAWithOpsel(MachineInstr *MI, MachineOperand &SrcMO,
+                                AMDGPU::SDWA::SdwaSel OpSel);
+
 public:
   bool run(MachineFunction &MF);
 };
@@ -1361,6 +1375,459 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
+/// Returns true if \p MI is one of the 16-bit VALU opcodes listed below.
+///
+/// An SDWA opcode is first mapped through AMDGPU::getBasicFromSDWAOp, so the
+/// SDWA and the basic encoding of an instruction are classified alike. Every
+/// opcode that is not explicitly accepted is rejected.
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  switch (Opcode) {
+  case AMDGPU::V_CVT_F16_U16_e32:
+  case AMDGPU::V_CVT_F16_U16_e64:
+  case AMDGPU::V_CVT_F16_I16_e32:
+  case AMDGPU::V_CVT_F16_I16_e64:
+  case AMDGPU::V_RCP_F16_e64:
+  case AMDGPU::V_RCP_F16_e32:
+  case AMDGPU::V_RSQ_F16_e64:
+  case AMDGPU::V_RSQ_F16_e32:
+  case AMDGPU::V_SQRT_F16_e64:
+  case AMDGPU::V_SQRT_F16_e32:
+  case AMDGPU::V_LOG_F16_e64:
+  case AMDGPU::V_LOG_F16_e32:
+  case AMDGPU::V_EXP_F16_e64:
+  case AMDGPU::V_EXP_F16_e32:
+  case AMDGPU::V_SIN_F16_e64:
+  case AMDGPU::V_SIN_F16_e32:
+  case AMDGPU::V_COS_F16_e64:
+  case AMDGPU::V_COS_F16_e32:
+  case AMDGPU::V_FLOOR_F16_e64:
+  case AMDGPU::V_FLOOR_F16_e32:
+  case AMDGPU::V_CEIL_F16_e64:
+  case AMDGPU::V_CEIL_F16_e32:
+  case AMDGPU::V_TRUNC_F16_e64:
+  case AMDGPU::V_TRUNC_F16_e32:
+  case AMDGPU::V_RNDNE_F16_e64:
+  case AMDGPU::V_RNDNE_F16_e32:
+  case AMDGPU::V_FRACT_F16_e64:
+  case AMDGPU::V_FRACT_F16_e32:
+  case AMDGPU::V_FREXP_MANT_F16_e64:
+  case AMDGPU::V_FREXP_MANT_F16_e32:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+  case AMDGPU::V_LDEXP_F16_e64:
+  case AMDGPU::V_LDEXP_F16_e32:
+  case AMDGPU::V_ADD_F16_e64:
+  case AMDGPU::V_ADD_F16_e32:
+  case AMDGPU::V_SUB_F16_e64:
+  case AMDGPU::V_SUB_F16_e32:
+  case AMDGPU::V_SUBREV_F16_e64:
+  case AMDGPU::V_SUBREV_F16_e32:
+  case AMDGPU::V_MUL_F16_e64:
+  case AMDGPU::V_MUL_F16_e32:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F16_e32:
+  case AMDGPU::V_MIN_F16_e64:
+  case AMDGPU::V_MIN_F16_e32:
+  case AMDGPU::V_MAD_F16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+  case AMDGPU::V_DIV_FIXUP_F16_e64:
+    return true;
+  case AMDGPU::V_MADAK_F16:
+  case AMDGPU::V_MADMK_F16:
+  case AMDGPU::V_FMAMK_F16:
+  case AMDGPU::V_FMAAK_F16:
+    // Deliberately rejected for now: it has not been established that these
+    // can be handled safely by the packing elimination.
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+    // Rejected: their SDWA versions only allow dst_sel to be DWORD.
+  default:
+    return false;
+  }
+}
+
+/// Checks that the two chain roots read complementary halves of
+/// \p SrcRootReg: \p Def1MI the upper word and \p Def0MI the lower word.
+///
+/// Requirements, all taken from the checks below:
+///  - Def1MI is already an SDWA instruction and Def0MI is not;
+///  - Def1MI writes the whole destination (dst_sel == DWORD) and pads the
+///    unused part (dst_unused == UNUSED_PAD);
+///  - Def1MI reads SrcRootReg through src0 or src1 with the matching
+///    src*_sel equal to WORD_1;
+///  - Def0MI reads SrcRootReg through src0 or src1, either with no src*_sel
+///    operand at all or with it equal to WORD_0.
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // Accessing the upper half requires Def1MI to have been converted to SDWA
+  // already, whereas Def0MI reads the lower half and must not have been.
+  if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+    return false;
+
+  // Def1 must write the entire DWORD of its destination, with the unused
+  // part zero-padded.
+  MachineOperand *Def1DstSel =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+  if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+    return false;
+  MachineOperand *Def1DstUnused =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+  if (!Def1DstUnused ||
+      Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
+    return false;
+
+  // True if the operand SrcName of DefMI is SrcRootReg and is read with the
+  // selector SdwaSel. A missing selector operand counts as WORD_0 only.
+  const auto CheckSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+                               AMDGPU::OpName SrcSelName,
+                               AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+    MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+    if (!DefSrc || !DefSrc->isReg() || DefSrc->getReg() != SrcRootReg)
+      return false;
+
+    MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+    if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+      return !DefSrcSel || DefSrcSel->getImm() == SdwaSel;
+
+    assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+           "Not valid SDWA SrcSel operand");
+    return DefSrcSel && DefSrcSel->getImm() == SdwaSel;
+  };
+
+  const bool Def1ReadsUpperHalf =
+      CheckSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                  AMDGPU::SDWA::SdwaSel::WORD_1) ||
+      CheckSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                  AMDGPU::SDWA::SdwaSel::WORD_1);
+  if (!Def1ReadsUpperHalf)
+    return false;
+
+  return CheckSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                     AMDGPU::SDWA::SdwaSel::WORD_0) ||
+         CheckSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                     AMDGPU::SDWA::SdwaSel::WORD_0);
+}
+
+/// Given A and B are in the same MBB, returns true if A comes before B.
+static bool dominates(MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  assert(A->getParent() == B->getParent());
+  const MachineBasicBlock *MBB = A->getParent();
+  auto MBBEnd = MBB->end();
+  if (B == MBBEnd)
+    return true;
+
+  if (A == MBBEnd)
+    return false;
+
+  // Linear walk from A towards the end of the block; B is reached only if it
+  // is A itself or comes after it.
+  MachineBasicBlock::const_iterator I = A;
+  while (I != B && I != MBBEnd)
+    ++I;
+
+  return (I == B);
+}
+
+/// Converts \p MI into its SDWA version (unless it already is one) and makes
+/// it operate on the half selected by \p OpSel:
+///  - dst_sel is set to OpSel and dst_unused to UNUSED_PRESERVE;
+///  - an implicit use of SrcMO's register is appended and tied to vdst, so
+///    that the preserved destination bits come from that register;
+///  - the src*_sel of the first of src0/src1 that reads SrcMO's register is
+///    set to OpSel.
+///
+/// \p SrcMO may be an operand of \p MI itself; nothing is read from it once
+/// MI has been erased.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
+                                              MachineOperand &SrcMO,
+                                              AMDGPU::SDWA::SdwaSel OpSel) {
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << *MI);
+
+  // Snapshot everything needed from SrcMO up front. The caller hands in an
+  // operand that belongs to MI, and MI is erased below when it has to be
+  // replaced by a newly created SDWA instruction.
+  const Register SrcReg = SrcMO.getReg();
+  MachineOperand ImplicitSrcMO =
+      MachineOperand::CreateReg(SrcReg, /*isDef=*/false, /*isImp=*/true);
+  copyRegOperand(ImplicitSrcMO, SrcMO);
+
+  if (!TII->isSDWA(MI->getOpcode())) {
+    MachineInstr *SDWAInst = createSDWAVersion(*MI);
+    MI->eraseFromParent();
+    MI = SDWAInst;
+  }
+
+  ConvertedInstructions.push_back(MI);
+  unsigned SDWAOpcode = MI->getOpcode();
+  ++NumSDWAInstructionsToEliminateFP16Pack;
+
+  MachineOperand *Dst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+  assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+  MachineOperand *DstSel = TII->getNamedOperand(*MI, AMDGPU::OpName::dst_sel);
+  assert(DstSel &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+  DstSel->setImm(OpSel);
+
+  MachineOperand *DstUnused =
+      TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
+  assert(DstUnused &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+  assert(DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE &&
+         "Dst_unused should not be UNUSED_PRESERVE already");
+  DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+  // Tie the destination to an implicit use of the source register so the
+  // untouched half of the destination is taken from it.
+  int PreserveDstIdx =
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+  assert(PreserveDstIdx != -1);
+  MI->addOperand(ImplicitSrcMO);
+  MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
+
+  // Sets SrcSelName to OpSel if the operand SrcName reads SrcReg.
+  auto ModifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+                                   AMDGPU::OpName SrcSelName) -> bool {
+    MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
+    assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
+    if (!Src->isReg() || Src->getReg() != SrcReg)
+      return false;
+
+    MachineOperand *SrcSel = TII->getNamedOperand(*MI, SrcSelName);
+    assert(SrcSel && AMDGPU::hasNamedOperand(SDWAOpcode, SrcSelName));
+    SrcSel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+    return true;
+  };
+
+  // src1 is only inspected when src0 does not read the register.
+  if (!ModifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+    ModifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel);
+}
+
+/// Walks up the use-def chain starting at \p ParentMI.
+///
+/// At every step the register uses of the current instruction are scanned
+/// for virtual registers that have a single use and whose single definition
+/// is an instruction accepted by both isSrcDestFP16Bits and
+/// isConvertibleToSDWA. Each such defining operand is pushed onto
+/// \p DefSrcQueue. The walk continues while exactly one such definition is
+/// found per step.
+///
+/// \returns the number of matching definitions found in the last step: 0
+/// when the chain simply ended, more than 1 when the chain forks and there
+/// is no unique path to follow.
+unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
+    MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
+    const GCNSubtarget &ST) {
+  unsigned NumOfFP16Def = 0;
+
+  do {
+    NumOfFP16Def = 0;
+    MachineInstr *NextMIInChain = nullptr;
+    for (MachineOperand &CurrentMO : ParentMI->uses()) {
+      if (!CurrentMO.isReg() || CurrentMO.getReg().isPhysical() ||
+          !MRI->hasOneUse(CurrentMO.getReg()))
+        continue;
+
+      MachineOperand *DefCurrMO = findSingleRegDef(&CurrentMO, MRI);
+      if (!DefCurrMO)
+        continue;
+
+      MachineInstr *DefCurrMI = DefCurrMO->getParent();
+      if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+          !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+        continue;
+
+      NextMIInChain = DefCurrMI;
+      DefSrcQueue.push(DefCurrMO);
+      NumOfFP16Def++;
+    }
+
+    // Only dereferenced again when exactly one definition was found, in
+    // which case it is non-null.
+    ParentMI = NextMIInChain;
+  } while (NumOfFP16Def == 1);
+
+  return NumOfFP16Def;
+}
+
+/// Tries to remove every V_PACK_B32_F16_e64 in \p MBB whose two sources are
+/// produced by element-wise 16-bit chains rooted at the lower and the upper
+/// half of one common register. The two chains are serialized into a single
+/// chain of SDWA instructions that write their half and preserve the other,
+/// after which the pack is redundant and is erased. Does nothing when the
+/// subtarget has no 16-bit instructions.
+void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
+                                          const GCNSubtarget &ST) {
+  if (!ST.has16BitInsts())
+    return;
+
+  for (MachineInstr &MI : make_early_inc_range(MBB)) {
+    if (MI.getOpcode() != AMDGPU::V_PACK_B32_F16_e64)
+      continue;
+    LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
+
+    // The users of the pack are rewired straight to the chain result below,
+    // so the pack must not apply any modifier of its own.
+    // NOTE(review): op_sel is assumed to be carried in the src*_modifiers
+    // immediates for this opcode -- confirm.
+    const auto HasNonZeroImm = [&](AMDGPU::OpName Name) -> bool {
+      const MachineOperand *Mod = TII->getNamedOperand(MI, Name);
+      return Mod && Mod->isImm() && Mod->getImm() != 0;
+    };
+    if (HasNonZeroImm(AMDGPU::OpName::src0_modifiers) ||
+        HasNonZeroImm(AMDGPU::OpName::src1_modifiers) ||
+        HasNonZeroImm(AMDGPU::OpName::clamp) ||
+        HasNonZeroImm(AMDGPU::OpName::omod))
+      continue;
+
+    std::queue<MachineOperand *> DefSrc0Queue;
+    std::queue<MachineOperand *> DefSrc1Queue;
+    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+    if (!Src0 || !Src1 || !Src0->isReg() || Src0->getReg().isPhysical() ||
+        !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+        Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+      continue;
+
+    MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+    MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+
+    if (!Op0 || !Op1)
+      continue;
+
+    MachineInstr *ParentMIOp0 = Op0->getParent();
+    MachineInstr *ParentMIOp1 = Op1->getParent();
+
+    if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+        !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+        !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+        !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+      continue;
+
+    DefSrc0Queue.push(Op0);
+    DefSrc1Queue.push(Op1);
+
+    // Extend both chains upwards. A result greater than one means some
+    // instruction has several candidate definitions, i.e. the chain forks
+    // and cannot be followed unambiguously.
+    if (computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST) > 1)
+      continue;
+
+    if (computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST) > 1)
+      continue;
+
+    MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+    MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+    Register SrcRootMOReg = AMDGPU::NoRegister;
+
+    // Look for a virtual register (same register and sub-register) used by
+    // the topmost instruction of both chains: that is the source root of the
+    // element-wise fp16 chains.
+    for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+      if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+        continue;
+
+      for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+        if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+          continue;
+
+        if (Current0MO.getReg() == Current1MO.getReg() &&
+            Current0MO.getSubReg() == Current1MO.getSubReg()) {
+          SrcRootMOReg = Current0MO.getReg();
+          break;
+        }
+      }
+      // Found it, no further search needed.
+      if (SrcRootMOReg != AMDGPU::NoRegister)
+        break;
+    }
+
+    if (SrcRootMOReg == AMDGPU::NoRegister)
+      continue;
+
+    // The roots must access the lower and the upper half word of
+    // SrcRootMOReg respectively.
+    if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg, TII))
+      continue;
+
+    // The graph below shows the connection:
+    //
+    //              Op0Initial --> Op0x --> ... --> Op0Final
+    //             /                                        \
+    //   SrcRootMO                                           v_pack_b32_f16
+    //             \                                        /
+    //              Op1Initial --> Op1x --> ... --> Op1Final
+    //
+    // For each DefSrcXQueue, OpXInitial is at the back and OpXFinal is at
+    // the front.
+    MachineInstr *Op0FinalMI = (DefSrc0Queue.front())->getParent();
+    MachineInstr *Op1FinalMI = (DefSrc1Queue.front())->getParent();
+    MachineInstr *Op0InitialMI = (DefSrc0Queue.back())->getParent();
+    MachineInstr *Op1InitialMI = (DefSrc1Queue.back())->getParent();
+
+    MachineOperand *FinalOutMO = nullptr;
+    std::queue<MachineOperand *> ChainedDefOps;
+    AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+    int NumOfElemInSecondOpChain = 0;
+
+    // Serializes the two chains as "From" followed by "To": the use of the
+    // source root in the topmost "To" instruction is redirected to the
+    // result of the "From" chain, and ChainedDefOps is filled, in order,
+    // with the "To" defs (final first), the "From" defs (final first) and
+    // finally the source-root use of the topmost "From" instruction. The
+    // queues are taken by value because they are drained here.
+    auto canonicalizedMIFlow =
+        [&](std::queue<MachineOperand *> DefFromQueue,
+            std::queue<MachineOperand *> DefToQueue) -> void {
+      MachineInstr *OpToInitialMI = (DefToQueue.back())->getParent();
+      int OpIdx = OpToInitialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+      auto &MOTo = OpToInitialMI->getOperand(OpIdx);
+      auto MOFrom = DefFromQueue.front();
+      copyRegOperand(MOTo, *MOFrom);
+
+      LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *OpToInitialMI << '\n');
+
+      FinalOutMO = DefToQueue.front();
+      MachineInstr *OpFromInitialMI = (DefFromQueue.back())->getParent();
+      OpIdx = OpFromInitialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+      auto &InitialInMO = OpFromInitialMI->getOperand(OpIdx);
+
+      while (!DefToQueue.empty()) {
+        ChainedDefOps.push(DefToQueue.front());
+        DefToQueue.pop();
+        NumOfElemInSecondOpChain++;
+      }
+      while (!DefFromQueue.empty()) {
+        ChainedDefOps.push(DefFromQueue.front());
+        DefFromQueue.pop();
+      }
+      ChainedDefOps.push(&InitialInMO);
+    };
+
+    // dominates() requires both instructions to live in the same block,
+    // which the single-definition lookups above do not guarantee. Treat
+    // instructions from different blocks as unordered.
+    const auto ComesBefore = [](MachineInstr *A, MachineInstr *B) -> bool {
+      return A->getParent() == B->getParent() && dominates(A, B);
+    };
+
+    // Re-route the flow according to the order of the instructions, when
+    // possible, and record it in ChainedDefOps for the SDWA conversion:
+    //
+    // If Op0Final comes before Op1Initial:
+    //   SrcRootMO -> Op0Initial -> Op0x -> ... -> Op0Final
+    //             -> Op1Initial -> Op1x -> ... -> Op1Final (FinalOutMO)
+    //
+    // If Op1Final comes before Op0Initial:
+    //   SrcRootMO -> Op1Initial -> Op1x -> ... -> Op1Final
+    //             -> Op0Initial -> Op0x -> ... -> Op0Final (FinalOutMO)
+    //
+    // TODO: Otherwise, not handled!
+    // One such case is observed when multiple fp16 instructions are chained
+    // on a fp16 vector input. For example:
+    //
+    //   %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+    //   %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+    //   return <2 x half> %res
+    if (ComesBefore(Op0FinalMI, Op1InitialMI)) {
+      canonicalizedMIFlow(DefSrc0Queue, DefSrc1Queue);
+      OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+    } else if (ComesBefore(Op1FinalMI, Op0InitialMI)) {
+      canonicalizedMIFlow(DefSrc1Queue, DefSrc0Queue);
+      OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+    } else {
+      LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+      continue;
+    }
+
+    // Replace all uses of the register defined by MI (v_pack) with
+    // FinalOutMO.
+    MachineOperand &DefMO = MI.getOperand(0);
+    for (MachineOperand &MO :
+         make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+      if (!MO.isReg())
+        continue;
+
+      MO.setReg(FinalOutMO->getReg());
+      MO.setSubReg(FinalOutMO->getSubReg());
+    }
+    LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI << "With "
+                      << *FinalOutMO << '\n');
+
+    // Delete the v_pack machine instruction.
+    LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+    MI.eraseFromParent();
+    ++Num16BitPackedInstructionsEliminated;
+
+    // Convert the chained instructions into their SDWA version, from the
+    // final one upwards. The last queue entry is the source-root use, which
+    // is only ever consumed as the SrcMO of the topmost instruction.
+    while (ChainedDefOps.size() != 1) {
+      // Once the second ("To") chain has been processed, switch to the
+      // other half for the first ("From") chain. The counter keeps
+      // decreasing below zero afterwards, so this flips exactly once.
+      if (NumOfElemInSecondOpChain == 0) {
+        if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+          OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+        else
+          OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+      }
+
+      MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+      ChainedDefOps.pop();
+      MachineOperand *SrcMO = ChainedDefOps.front();
+
+      // A def operand stands for its (single) use inside DefMI.
+      if (SrcMO->isDef()) {
+        assert(MRI->hasOneUse(SrcMO->getReg()));
+        SrcMO = findSingleRegUse(SrcMO, MRI);
+        assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
+      }
+
+      convertMIToSDWAWithOpsel(DefMI, *SrcMO, OpSel);
+      NumOfElemInSecondOpChain--;
+    }
+  }
+}
+
 bool SIPeepholeSDWA::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
@@ -1418,6 +1885,12 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
       while (!ConvertedInstructions.empty())
         legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
     } while (Changed);
+
+    // Process each v_pack_b32_fp16 instruction in MBB.
+    eliminateFP16Packing(MBB, ST);
+    Ret |= !ConvertedInstructions.empty();
+    while (!ConvertedInstructions.empty())
+      legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
   }
 
   return Ret;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index e8e29c3d4b526..e7cc28eeabffb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -545,12 +545,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ;
GFX9-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -565,12 +563,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul: @@ -578,12 +574,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; 
GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -598,12 +592,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -644,12 +636,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -664,12 +656,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul_rhs: @@ -677,12 +669,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -697,12 +689,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll index 70f961e2777af..ad11c9b5f28ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -221,12 +221,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -241,12 +239,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul: @@ -254,12 +250,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -274,12 +268,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 
dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 5ba036c386a40..56827df6f027c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -1101,21 +1101,21 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fdiv_v2f16_afn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_afn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE 
src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_afn: @@ -2782,17 +2782,15 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX9-LABEL: v_rcp_v2f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_arcp: @@ -2834,17 +2832,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) { ; GFX9-LABEL: v_rcp_v2f16_arcp_afn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp_afn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_arcp_afn: @@ -3192,21 +3188,21 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 +; 
GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -3310,21 +3306,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 
dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -3372,21 +3368,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 
dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index c43731893c2d7..12d7f9d4af8c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -2188,17 +2188,15 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-LABEL: v_rsq_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_rsq_f16_e32 v0, v0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rsq_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_rsq_f16_e32 v0, v0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_rsq_v2f16: @@ -2398,17 +2396,15 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-LABEL: v_neg_rsq_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_rsq_f16_e32 v0, v0 -; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1 +; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rsq_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_rsq_f16_e32 v0, v0 -; GFX10-NEXT: v_pack_b32_f16 v0, -v0, -v1 +; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index 63ba18a5433aa..68a3db1472aa2 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -187,22 +187,18 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; GFX10-LABEL: fmul_pow2_8xhalf: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3 -; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2 -; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1 -; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1 -; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2 -; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1] @@ -302,18 +298,14 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7000 -; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v3 -; GFX10-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2 -; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1 -; GFX10-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0 -; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v8, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v7, v1 -; GFX10-NEXT: v_pack_b32_f16 v2, v6, v2 -; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3 +; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf: @@ -1085,9 +1077,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0] -; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index d957ba93e4fb3..1ca358b90c58a 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1587,9 +1587,8 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX8-LABEL: basic_fract_v2f16_nonan: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_fract_f16_e32 v1, v0 -; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan: @@ -2610,15 +2609,15 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s6, 0x204 -; GFX8-NEXT: v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_floor_f16_e32 v4, v0 ; GFX8-NEXT: v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cmp_class_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_pack_b32_f16 v3, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 ; GFX8-NEXT: v_fract_f16_e32 v4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s6 +; GFX8-NEXT: v_floor_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5] +; GFX8-NEXT: v_floor_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_0 ; GFX8-NEXT: v_pack_b32_f16 v0, v0, v5 ; GFX8-NEXT: global_store_dword v[1:2], v3, off ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 8c5bc4a33a303..a8ddc564e4b51 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -188,11 +188,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 -; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cos_f16_e32 v2, v3 -; GFX9-NEXT: v_cos_f16_e32 v1, v1 -; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -204,11 +203,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 -; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_cos_f16_e32 v2, v3 -; GFX10-NEXT: v_cos_f16_e32 v1, v1 -; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 
dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index fdccacf372dfa..a3ee9655f40a4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -6719,9 +6719,8 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5 ; GFX900-SDAG-NEXT: v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0] -; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v0 -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp_v2f16_fast: @@ -6904,13 +6903,12 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x3dc5, v0 -; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2 -; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1 ; GFX900-SDAG-NEXT: 
v_exp_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp_v3f16_afn: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 0c2e6f82c9115..2dff6e21f8a17 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -6811,11 +6811,10 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v0 -; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp10_v2f16_fast: @@ -6998,13 +6997,12 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x3dc5, v0 -; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2 -; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1 ; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp10_v3f16_afn: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index c34113a5dfab0..e6e518d13b5f7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -3335,9 +3335,8 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_exp2_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp2_v2f16: @@ -3413,9 +3412,8 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, |v0| -; GFX900-SDAG-NEXT: v_pack_b32_f16 
v0, v0, v1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16: @@ -3497,9 +3495,8 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -|v0| -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16: @@ -3582,9 +3579,8 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16: @@ -3656,9 +3652,8 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_exp2_v2f16_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: 
v_exp_f16_e32 v0, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp2_v2f16_fast: @@ -3738,10 +3733,9 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) { ; GFX900-SDAG-LABEL: v_exp_v3f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp_v3f16: @@ -3822,10 +3816,9 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) { ; GFX900-SDAG-LABEL: v_exp2_v3f16_afn: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_exp2_v3f16_afn: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index f44faf4f7edba..c7dcbcfde6d89 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -411,9 
+411,9 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v0 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v2, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX9-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16 @@ -522,13 +522,14 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 -; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX9-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 -; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32: @@ -628,9 +629,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; 
GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v0, v0 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract: @@ -698,9 +698,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v1, v0 -; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 0e66b0af99f34..fe05fdf1226ec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -461,10 +461,9 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fff ; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v3 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX9-SDAG-NEXT: v_med3_i32 v1, 
v1, s4, v3 -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32: @@ -585,9 +584,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16: @@ -697,12 +695,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7fff ; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v5 -; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v5 ; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v5 -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v5 ; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_0 src1_sel:DWORD ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32: @@ -844,10 +841,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i16: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v4 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16: @@ -973,15 +969,13 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7fff ; GFX9-SDAG-NEXT: v_med3_i32 v5, v5, s4, v6 -; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v6 ; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v6 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v6 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v6 -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4 -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v5 +; GFX9-SDAG-NEXT: 
v_ldexp_f16_sdwa v1, v1, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32: @@ -1157,12 +1151,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i16: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3 -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v5 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v4 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 1dd6a7926029e..bb23fdd26402d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -6870,15 +6870,25 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) { ; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log_v2f16: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX900-NEXT: v_log_f16_e32 v1, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log_v2f16: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log_v2f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7005,22 +7015,22 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0| -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log_fabs_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16: @@ -7157,22 +7167,22 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log_fneg_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0| -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16: @@ -7310,22 +7320,22 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log_fneg_v2f16: 
; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log_fneg_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16: @@ -7449,15 +7459,25 @@ define <2 x half> 
@v_log_v2f16_fast(<2 x half> %in) { ; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log_v2f16_fast: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_log_f16_e32 v1, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log_v2f16_fast: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log_v2f16_fast: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7576,17 +7596,29 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) { 
; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log_v3f16: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_log_f16_e32 v2, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_log_f16_e32 v1, v1 -; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2 -; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log_v3f16: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log_v3f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c +; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 
src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7715,17 +7747,29 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) { ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log_v3f16_fast: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_log_f16_e32 v2, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_log_f16_e32 v1, v1 -; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2 -; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log_v3f16_fast: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log_v3f16_fast: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c +; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa 
v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7886,31 +7930,29 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) { ; GFX900-SDAG-LABEL: v_log_v4f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 
dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log_v4f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x398c, v2 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16: @@ -8089,31 +8131,29 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) { ; GFX900-SDAG-LABEL: v_log_v4f16_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log_v4f16_fast: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: 
v_mul_f16_e32 v2, 0x398c, v2 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 86a58d26c6ae5..dcf789e26de54 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -6870,15 +6870,25 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) { ; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log10_v2f16: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_log_f16_e32 v1, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-NEXT: v_mul_f16_e32 
v0, 0x34d1, v0 -; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log10_v2f16: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log10_v2f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7005,22 +7015,22 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log10_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0| -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; 
GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log10_fabs_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16: @@ -7157,22 +7167,22 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log10_fneg_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0| -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; 
GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16: @@ -7310,22 +7320,22 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log10_fneg_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log10_fneg_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16: @@ -7449,15 +7459,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) { ; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log10_v2f16_fast: -; GFX900: ; %bb.0: -; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_log_f16_e32 v1, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log10_v2f16_fast: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log10_v2f16_fast: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7576,17 +7596,29 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) { ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log10_v3f16: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX900-NEXT: v_log_f16_e32 v2, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_log_f16_e32 v1, v1 -; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2 -; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log10_v3f16: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log10_v3f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1 +; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7715,17 +7747,29 @@ define 
<3 x half> @v_log10_v3f16_fast(<3 x half> %in) { ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_log10_v3f16_fast: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_log_f16_e32 v2, v0 -; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-NEXT: v_log_f16_e32 v1, v1 -; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2 -; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-SDAG-LABEL: v_log10_v3f16_fast: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_log10_v3f16_fast: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1 +; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, 
v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast: ; GFX1100-SDAG-TRUE16: ; %bb.0: @@ -7886,31 +7930,29 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) { ; GFX900-SDAG-LABEL: v_log10_v4f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log10_v4f16: ; GFX900-GISEL: ; 
%bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16: @@ -8089,31 +8131,29 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) { ; GFX900-SDAG-LABEL: v_log10_v4f16_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3 -; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log10_v4f16_fast: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0 -; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3 -; GFX900-GISEL-NEXT: 
v_mul_f16_e32 v1, 0x34d1, v1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index ea88f77f98735..5544fd764e841 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -4249,17 +4249,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; 
; GFX900-GISEL-LABEL: v_log2_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16: @@ -4367,18 +4365,16 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e64 v0, |v0| -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_fabs_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16: @@ -4494,18 +4490,16 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) { ; 
GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -|v0| -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16: @@ -4622,18 +4616,16 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_fneg_v2f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_fneg_v2f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16: @@ -4739,17 +4731,15 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_v2f16_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_v2f16_fast: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast: @@ -4861,19 +4851,17 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_v3f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_v3f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16: @@ -4989,19 +4977,17 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_v3f16_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_v3f16_fast: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast: @@ -5129,23 +5115,19 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_v4f16: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0 -; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_v4f16: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 
dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16: @@ -5284,23 +5266,19 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) { ; GFX900-SDAG-LABEL: v_log2_v4f16_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0 -; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-GISEL-LABEL: v_log2_v4f16_fast: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1 -; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX900-GISEL-NEXT: 
v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index de12f2b246f57..c6fee73f4580d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -198,9 +198,8 @@ define amdgpu_kernel void @rint_v2f16( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rndne_f16_e32 v1, v0 -; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index 1a426096da197..92e8dce75222a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -188,11 +188,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 -; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_sin_f16_e32 v2, v3 -; GFX9-NEXT: v_sin_f16_e32 v1, v1 -; GFX9-NEXT: 
v_pack_b32_f16 v1, v2, v1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -204,11 +203,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 -; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_sin_f16_e32 v2, v3 -; GFX10-NEXT: v_sin_f16_e32 v1, v1 -; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir new file mode 100644 index 0000000000000..e45d7dd8f2029 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir @@ -0,0 +1,214 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-peephole-sdwa,dead-mi-elimination -o - %s | FileCheck -check-prefixes=GFX9 %s + +--- +name: 
symmetric_equal_edges_fp16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX9-LABEL: name: symmetric_equal_edges_fp16 + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec + ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0) + ; GFX9-NEXT: [[V_SIN_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0) + ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec + ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa]](tied-def 0) + ; GFX9-NEXT: [[V_SIN_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0) + ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa1]] + ; GFX9-NEXT: SI_RETURN implicit $vgpr0 + %8:vgpr_32 = COPY $vgpr0 + %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec + %21:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec + %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec + %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec + %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %26 + SI_RETURN implicit $vgpr0 +... 
+ +--- +name: asymmetric_equal_edges_fp16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX9-LABEL: name: asymmetric_equal_edges_fp16 + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec + ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0) + ; GFX9-NEXT: [[V_COS_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0) + ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec + ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa]](tied-def 0) + ; GFX9-NEXT: [[V_EXP_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0) + ; GFX9-NEXT: $vgpr0 = COPY [[V_EXP_F16_sdwa]] + ; GFX9-NEXT: SI_RETURN implicit $vgpr0 + %8:vgpr_32 = COPY $vgpr0 + %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec + %21:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec + %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec + %25:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec + %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %26 + SI_RETURN implicit $vgpr0 +... 
+ +--- +name: asymmetric_unequal_edges_fp16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX9-LABEL: name: asymmetric_unequal_edges_fp16 + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732 + ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0) + ; GFX9-NEXT: [[V_EXP_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0) + ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_EXP_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_EXP_F16_sdwa]](tied-def 0) + ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa]] + ; GFX9-NEXT: SI_RETURN implicit $vgpr0 + %8:vgpr_32 = COPY $vgpr0 + %12:sreg_32 = S_MOV_B32 14732 + %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec + %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %24:vgpr_32 = nofpexcept V_EXP_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec + %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec + %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %26 + SI_RETURN implicit $vgpr0 +... 
+ +--- +name: symmetric_one_edge_fp16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX9-LABEL: name: symmetric_one_edge_fp16 + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0) + ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0) + ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa1]] + ; GFX9-NEXT: SI_RETURN implicit $vgpr0 + %8:vgpr_32 = COPY $vgpr0 + %14:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec + %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec + %21:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %16, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %21 + SI_RETURN implicit $vgpr0 +... 
+ +--- +name: symmetric_equal_edges_fp16_fp32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX9-LABEL: name: symmetric_equal_edges_fp16_fp32 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811 + ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_EXP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]] + ; GFX9-NEXT: SI_RETURN implicit $vgpr0 + %8:vgpr_32 = COPY $vgpr0 + %12:sgpr_32 = S_MOV_B32 1069066811 + %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec + 
%25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec + %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec + %27:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec + %28:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %30:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %28, 0, 0, implicit $mode, implicit $exec + %31:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %30, 0, %12, 0, 0, implicit $mode, implicit $exec + %32:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec + %33:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %32, 0, 0, implicit $mode, implicit $exec + %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %33, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %34 + SI_RETURN implicit $vgpr0 +... + +--- +name: asymmetric_unequal_edges_fp16_fp32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX9-LABEL: name: asymmetric_unequal_edges_fp16_fp32 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811 + ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[S_MOV_B32_]], 0, 0, 6, 0, 5, 6, implicit $mode, implicit 
$exec + ; GFX9-NEXT: [[V_EXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed [[V_MUL_F16_sdwa]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_EXP_F16_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]] + ; GFX9-NEXT: SI_RETURN implicit $vgpr0 + %8:vgpr_32 = COPY $vgpr0 + %12:sgpr_32 = S_MOV_B32 1069066811 + %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec + %25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec + %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec + %27:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec + %28:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %31:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %28, 0, %12, 0, 0, implicit $mode, implicit $exec + %32:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec + %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %32, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %34 + SI_RETURN implicit $vgpr0 +... 
+ +--- +name: interleaved_symmetric_edges_fp16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX9-LABEL: name: interleaved_symmetric_edges_fp16 + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_SIN_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]] + ; GFX9-NEXT: SI_RETURN implicit $vgpr0 + %8:vgpr_32 = COPY $vgpr0 + %11:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec + %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec + %26:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %18, 0, 12568, 0, 0, implicit $mode, implicit $exec + %27:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec + %28:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %13, 0, 12568, 0, 0, implicit $mode, implicit $exec + %29:vgpr_32 = nofpexcept 
V_SIN_F16_e64 0, killed %28, 0, 0, implicit $mode, implicit $exec + %30:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %29, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %30 + SI_RETURN implicit $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll index 04eea20993608..9f31bde8086d0 100644 --- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll +++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll @@ -829,9 +829,8 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x ; GFX9-LABEL: v_repeat_divisor_v2f16_x2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_rcp_f16_e32 v2, v2 -; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3 +; GFX9-NEXT: v_rcp_f16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_rcp_f16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -932,15 +931,14 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x ; GFX9-LABEL: v_repeat_divisor_v3f16_x2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_rcp_f16_e32 v4, v4 +; GFX9-NEXT: v_rcp_f16_sdwa v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: v_rcp_f16_e32 v5, v5 ; GFX9-NEXT: s_movk_i32 s4, 0x7e00 -; GFX9-NEXT: v_pack_b32_f16 v4, v4, v6 +; GFX9-NEXT: v_rcp_f16_sdwa v4, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GFX9-NEXT: v_pack_b32_f16 v5, v5, s4 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v5 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 ; GFX9-NEXT: v_pk_mul_f16 
v4, v2, v4 ; GFX9-NEXT: v_alignbit_b32 v2, v3, v4, 16 ; GFX9-NEXT: v_pack_b32_f16 v1, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 59a1fe041bf90..97358044abdaa 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -460,17 +460,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) { ; GFX9-LABEL: v_roundeven_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rndne_f16_e32 v1, v0 -; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rndne_f16_e32 v1, v0 -; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_roundeven_v2f16: @@ -523,17 +521,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) { ; SDAG_GFX9-LABEL: v_roundeven_v2f16: ; SDAG_GFX9: ; %bb.0: ; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0 -; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_0 ; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX10-LABEL: v_roundeven_v2f16: ; SDAG_GFX10: ; %bb.0: ; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0 -; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16: @@ -602,18 +598,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX9-NEXT: v_rndne_f16_e32 v1, v0 -; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_v2f16_fneg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_rndne_f16_e32 v1, v0 -; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg: @@ -676,17 +670,15 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) { ; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg: ; SDAG_GFX9: ; %bb.0: ; 
SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX9-NEXT: v_rndne_f16_e64 v0, -v0 -; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg: ; SDAG_GFX10: ; %bb.0: ; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX10-NEXT: v_rndne_f16_e64 v0, -v0 -; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg: @@ -759,23 +751,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; GFX9-LABEL: v_roundeven_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rndne_f16_e32 v2, v0 -; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_rndne_f16_e32 v3, v1 -; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rndne_f16_e32 v2, v0 -; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_rndne_f16_e32 v3, v1 -; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_roundeven_v4f16: @@ -850,23 +838,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; SDAG_GFX9-LABEL: v_roundeven_v4f16: ; SDAG_GFX9: ; %bb.0: ; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX9-NEXT: v_rndne_f16_e32 v1, v1 -; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0 -; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v3 -; SDAG_GFX9-NEXT: v_pack_b32_f16 v1, v1, v2 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX10-LABEL: v_roundeven_v4f16: ; 
SDAG_GFX10: ; %bb.0: ; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0 -; SDAG_GFX10-NEXT: v_rndne_f16_e32 v1, v1 -; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v3 -; SDAG_GFX10-NEXT: v_pack_b32_f16 v1, v1, v2 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16: