diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a8e4ea9429f50..d2a69a437a578 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -519,6 +519,9 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+void initializeAMDGPUMarkSGPRHazardRegsLegacyPass(PassRegistry &);
+extern char &AMDGPUMarkSGPRHazardRegsLegacyID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
new file mode 100644
index 0000000000000..46dfcbb48e54f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
@@ -0,0 +1,102 @@
+//===- AMDGPUMarkSGPRHazardRegs.cpp - Annotate SGPRs used by VALU ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to mark SGPRs used by VALU.
+/// Marks can be used during register allocation to reduce hazards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMarkSGPRHazardRegs.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-sgpr-hazard-regs"
+
+namespace {
+
+class AMDGPUMarkSGPRHazardRegs {
+public:
+  AMDGPUMarkSGPRHazardRegs() {}
+  bool run(MachineFunction &MF);
+};
+
+class AMDGPUMarkSGPRHazardRegsLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUMarkSGPRHazardRegsLegacy() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+    return AMDGPUMarkSGPRHazardRegs().run(MF);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
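+
+// run() walks every virtual register in the function and, for each SGPR-class
+// vreg, checks whether any non-debug use is a VALU instruction.  Matching
+// vregs are tagged with the SGPR_HAZARD_REG flag, which
+// SIRegisterInfo::getRegAllocationHints() reads when ordering SGPR candidates.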
+
+bool AMDGPUMarkSGPRHazardRegs::run(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasVALUReadSGPRHazard())
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (!TRI->getSGPRHazardAvoidanceStrategy(MF))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "AMDGPUMarkSGPRHazardRegs: function " << MF.getName()
+                    << "\n");
+
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+    Register Reg = Register::index2VirtReg(I);
+    if (MRI->reg_nodbg_empty(Reg))
+      continue;
+    const auto *RC = MRI->getRegClass(Reg);
+    if (!RC || !TRI->isSGPRClass(RC))
+      continue;
+    for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
+      const MachineInstr &MI = *MO.getParent();
+      if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
+        FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+        break;
+      }
+    }
+  }
+
+  return true;
+}
+
+INITIALIZE_PASS(AMDGPUMarkSGPRHazardRegsLegacy, DEBUG_TYPE,
+                "AMDGPU Mark Hazard SGPRs", false, false)
+
+char AMDGPUMarkSGPRHazardRegsLegacy::ID = 0;
+
+char &llvm::AMDGPUMarkSGPRHazardRegsLegacyID =
+    AMDGPUMarkSGPRHazardRegsLegacy::ID;
+
+PreservedAnalyses
+AMDGPUMarkSGPRHazardRegsPass::run(MachineFunction &MF,
+                                  MachineFunctionAnalysisManager &MFAM) {
+  AMDGPUMarkSGPRHazardRegs().run(MF);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
new file mode 100644
index 0000000000000..89905ceb1185d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
@@ -0,0 +1,25 @@
+//===--- AMDGPUMarkSGPRHazardRegs.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUMarkSGPRHazardRegsPass
+    : public PassInfoMixin<AMDGPUMarkSGPRHazardRegsPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4937b434bc955..ca4bcaeb610dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -23,6 +23,7 @@
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPUMacroFusion.h"
+#include "AMDGPUMarkSGPRHazardRegs.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPURemoveIncompatibleFunctions.h"
 #include "AMDGPUReserveWWMRegs.h"
@@ -561,6 +562,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeGCNRegPressurePrinterPass(*PR);
   initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
   initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
+  initializeAMDGPUMarkSGPRHazardRegsLegacyPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1616,6 +1618,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   addPass(&GCNPreRALongBranchRegID);
 
+  addPass(&AMDGPUMarkSGPRHazardRegsLegacyID);
   addPass(createSGPRAllocPass(true));
 
   // Commit allocated register changes. This is mostly necessary because too
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..9d94de0a741a4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -84,6 +84,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUIGroupLP.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
+  AMDGPUMarkSGPRHazardRegs.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 8e811b43a4532..a60b26d9cbb0c 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1067,6 +1067,7 @@ namespace VirtRegFlag {
 enum Register_Flag : uint8_t {
   // Register operand in a whole-wave mode operation.
   WWM_REG = 1 << 0,
+  SGPR_HAZARD_REG = 1 << 1
 };
 
 } // namespace VirtRegFlag
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index c1ac9491b2363..47d94d1471775 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -11,14 +11,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "SIRegisterInfo.h"
 #include "AMDGPU.h"
 #include "AMDGPURegisterBankInfo.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35,6 +36,10 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<unsigned> SGPRHazardAvoidanceStrategy(
+    "amdgpu-sgpr-hazard-regalloc", cl::init(0), cl::ReallyHidden,
+    cl::desc("Register allocation strategy to reduce SGPR read hazards"));
+
 std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
@@ -3781,9 +3786,152 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     return false;
   }
   default:
-    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
-                                                     VRM);
+    break;
+  }
+
+  bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+      VirtReg, Order, Hints, MF, VRM, Matrix);
+  if (!VRM)
+    return BaseImplRetVal;
+
+  // Only use hinting to reduce SGPR read hazards when required.
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasVALUReadSGPRHazard())
+    return BaseImplRetVal;
+
+  // Only handle SGPR register classes.
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  const auto *RC = MRI.getRegClass(VirtReg);
+  if (!isSGPRClass(RC))
+    return BaseImplRetVal;
+
+  const unsigned Strategy = getSGPRHazardAvoidanceStrategy(MF);
+  if (!Strategy)
+    return BaseImplRetVal;
+
+  SmallSet<MCPhysReg, 8> CopyHints;
+  CopyHints.insert(Hints.begin(), Hints.end());
+
+  auto AddHint = [&](MCPhysReg PhysReg) {
+    if (CopyHints.contains(PhysReg) || MRI.isReserved(PhysReg))
+      return;
+    Hints.push_back(PhysReg);
+  };
+  auto AddHints = [&](ArrayRef<MCPhysReg> Regs) {
+    for (MCPhysReg PhysReg : Regs)
+      AddHint(PhysReg);
+  };
+
+  // V1: simply reverse the allocation order; mean 23% reduction in hazards.
+  if (Strategy == 1) {
+    if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+      for (MCPhysReg PhysReg : reverse(Order))
+        AddHint(PhysReg);
+    } else {
+      for (MCPhysReg PhysReg : Order)
+        AddHint(PhysReg);
+    }
+    return true;
+  }
+
+  // Build the set of current hazard pairs from the live register matrix.
+  auto *LiveUnions = const_cast<LiveRegMatrix *>(Matrix)->getLiveUnions();
+
+  DenseMap<MCPhysReg, unsigned> IntervalCount;
+  std::bitset<64> HazardPairs;
+
+  for (MCPhysReg PhysReg : Order) {
+    SmallSet<const LiveInterval *, 8> Intervals;
+    bool IsHazard = false;
+    for (auto Unit : TRI->regunits(PhysReg)) {
+      LiveIntervalUnion &LIU = LiveUnions[Unit];
+      for (const LiveInterval *LI : LIU.getMap()) {
+        Intervals.insert(LI);
+        if (FuncInfo->checkFlag(LI->reg(),
+                                AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+          IsHazard = true;
+          // Break here as we only care about the interval count for
+          // non-hazard regs.
+          break;
+        }
+      }
+      if (IsHazard)
+        break;
+    }
+    if (IsHazard) {
+      unsigned PairN = TRI->getEncodingValue(PhysReg) >> 1;
+      if (PairN <= 63)
+        HazardPairs.set(PairN);
+    }
+    IntervalCount[PhysReg] = Intervals.size();
+  }
+
+  // V2: weight the entire order based on hazard-free usage; mean 30% reduction
+  // in hazards.
+  if (Strategy == 2) {
+    bool VRegIsHazard =
+        FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+    SmallVector<MCPhysReg> NewOrder(Order.begin(), Order.end());
+    std::sort(NewOrder.begin(), NewOrder.end(), [&](MCPhysReg A, MCPhysReg B) {
+      return VRegIsHazard ? IntervalCount[A] < IntervalCount[B]
+                          : IntervalCount[B] < IntervalCount[A];
+    });
+    AddHints(NewOrder);
+    return true;
+  }
+
+  // V3: complex partitioning; mean 35% reduction in hazards.
+  assert(Strategy == 3);
+
+  // Partition the allocation order based on hazards.
+  SmallVector<MCPhysReg> Unallocated, UnallocatedWithHazard;
+  SmallVector<MCPhysReg> Allocated, AllocatedWithHazard;
+
+  for (MCPhysReg PhysReg : Order) {
+    Register VReg = Matrix->getOneVReg(PhysReg);
+    bool HasHazard = false;
+    // XXX: can remove regunit scan for just SGPR32/SGPR64
+    for (auto Unit : TRI->regunits(PhysReg)) {
+      unsigned PairN = TRI->getEncodingValue(Unit) >> 1;
+      if (PairN <= 63 && HazardPairs[PairN]) {
+        HasHazard = true;
+        break;
+      }
+    }
+    if (VReg == MCRegister::NoRegister) {
+      if (HasHazard)
+        UnallocatedWithHazard.push_back(PhysReg);
+      else
+        Unallocated.push_back(PhysReg);
+    } else {
+      if (HasHazard)
+        AllocatedWithHazard.push_back(PhysReg);
+      else
+        Allocated.push_back(PhysReg);
+    }
   }
+
+  if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+    // Reorder allocations based on usage, so the least used will be reused
+    // first.  This means the least used regs are touched by hazards first.
+    std::sort(Allocated.begin(), Allocated.end(),
+              [&](MCPhysReg A, MCPhysReg B) {
+                return IntervalCount[A] < IntervalCount[B];
+              });
+    // Reverse the order of unallocated registers to try to keep hazards away
+    // - yes, it helps.
+    std::reverse(Unallocated.begin(), Unallocated.end());
+
+    AddHints(AllocatedWithHazard);
+    AddHints(UnallocatedWithHazard);
+    AddHints(Unallocated);
+    AddHints(Allocated);
+  } else {
+    AddHints(Allocated);
+    AddHints(Unallocated);
+    AddHints(UnallocatedWithHazard);
+    AddHints(AllocatedWithHazard);
+  }
+
+  return true;
 }
 
 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
@@ -4005,3 +4153,11 @@ SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
     RegFlags.push_back("WWM_REG");
   return RegFlags;
 }
+
+unsigned SIRegisterInfo::getSGPRHazardAvoidanceStrategy(
+    const MachineFunction &MF) const {
+  if (SGPRHazardAvoidanceStrategy.getNumOccurrences())
+    return SGPRHazardAvoidanceStrategy;
+  return MF.getFunction().getFnAttributeAsParsedInteger(
+      "amdgpu-sgpr-hazard-regalloc", 0);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index f3068963fd10f..13b6748e7616c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -477,6 +477,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   SmallVector<StringRef>
   getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+  unsigned getSGPRHazardAvoidanceStrategy(const MachineFunction &MF) const;
 };
 
 namespace AMDGPU {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 4b6cc32522f5b..41e481c272971 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -357,6 +357,7 @@
 ; GCN-O1-NEXT:        SI Whole Quad Mode
 ; GCN-O1-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O1-NEXT:        AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-NEXT:        AMDGPU Mark Hazard SGPRs
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Machine Block Frequency Analysis
; GCN-O1-NEXT: Debug Variable Analysis @@ -670,6 +671,7 @@ ; GCN-O1-OPTS-NEXT: SI Whole Quad Mode ; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA ; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Long Branch Reg +; GCN-O1-OPTS-NEXT: AMDGPU Mark Hazard SGPRs ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction ; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Debug Variable Analysis @@ -989,6 +991,7 @@ ; GCN-O2-NEXT: SI optimize exec mask operations pre-RA ; GCN-O2-NEXT: SI Form memory clauses ; GCN-O2-NEXT: AMDGPU Pre-RA Long Branch Reg +; GCN-O2-NEXT: AMDGPU Mark Hazard SGPRs ; GCN-O2-NEXT: Machine Natural Loop Construction ; GCN-O2-NEXT: Machine Block Frequency Analysis ; GCN-O2-NEXT: Debug Variable Analysis @@ -1321,6 +1324,7 @@ ; GCN-O3-NEXT: SI optimize exec mask operations pre-RA ; GCN-O3-NEXT: SI Form memory clauses ; GCN-O3-NEXT: AMDGPU Pre-RA Long Branch Reg +; GCN-O3-NEXT: AMDGPU Mark Hazard SGPRs ; GCN-O3-NEXT: Machine Natural Loop Construction ; GCN-O3-NEXT: Machine Block Frequency Analysis ; GCN-O3-NEXT: Debug Variable Analysis diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll new file mode 100644 index 0000000000000..36105f64f11b5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=0 < %s | FileCheck -check-prefix DEF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=1 < %s | FileCheck -check-prefix V1 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=2 < %s | FileCheck -check-prefix V2 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 < %s | FileCheck -check-prefix V3 %s + +define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float %d, ptr addrspace(1) %out, <4 x i32> inreg %desc) { +; DEF-LABEL: fadd_f32: +; DEF: ; %bb.0: ; %entry +; DEF-NEXT: s_mov_b32 s6, s4 +; DEF-NEXT: s_mov_b32 s4, s2 +; DEF-NEXT: s_add_f32 s2, s0, s1 +; DEF-NEXT: s_sub_f32 s1, s0, s1 +; DEF-NEXT: s_mov_b32 s7, s5 +; DEF-NEXT: s_mov_b32 s5, s3 +; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; DEF-NEXT: v_dual_add_f32 v0, s2, v0 :: v_dual_add_f32 v1, s1, v1 +; DEF-NEXT: v_readfirstlane_b32 s0, v0 +; DEF-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DEF-NEXT: v_readfirstlane_b32 s3, v1 +; DEF-NEXT: v_mul_f32_e32 v4, v0, v1 +; DEF-NEXT: s_and_b32 s0, s0, s3 +; DEF-NEXT: global_store_b32 v[2:3], v4, off +; DEF-NEXT: s_wait_alu 0xfffe +; DEF-NEXT: s_cmp_lg_u32 s0, 0 +; DEF-NEXT: s_mov_b32 s0, 0 +; DEF-NEXT: s_cbranch_scc0 .LBB0_5 +; DEF-NEXT: ; %bb.1: ; %false +; DEF-NEXT: s_buffer_load_b32 s3, s[4:7], 0x0 +; DEF-NEXT: s_and_b32 s1, s2, s1 +; DEF-NEXT: v_add_f32_e32 v0, v0, v1 +; DEF-NEXT: s_mov_b32 s8, exec_lo +; DEF-NEXT: s_wait_kmcnt 0x0 +; DEF-NEXT: s_wait_alu 0xfffe +; DEF-NEXT: s_lshl_b32 s1, s3, s1 +; DEF-NEXT: s_wait_alu 0xfffe +; DEF-NEXT: v_cmp_ne_u32_e32 vcc_lo, s1, v1 +; DEF-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo +; DEF-NEXT: s_wait_alu 0xfffe +; DEF-NEXT: s_and_not1_b32 s8, s8, s1 +; DEF-NEXT: s_cbranch_scc0 .LBB0_6 +; DEF-NEXT: ; %bb.2: ; %false +; DEF-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DEF-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; DEF-NEXT: s_cbranch_vccnz .LBB0_4 +; DEF-NEXT: .LBB0_3: ; %true +; DEF-NEXT: v_mul_f32_e32 v0, v1, 
v4 +; DEF-NEXT: .LBB0_4: ; %final +; DEF-NEXT: s_branch .LBB0_7 +; DEF-NEXT: .LBB0_5: +; DEF-NEXT: ; implicit-def: $vgpr0 +; DEF-NEXT: s_branch .LBB0_3 +; DEF-NEXT: .LBB0_6: +; DEF-NEXT: s_mov_b32 exec_lo, 0 +; DEF-NEXT: export mrt0 off, off, off, off done +; DEF-NEXT: s_endpgm +; DEF-NEXT: .LBB0_7: +; +; V1-LABEL: fadd_f32: +; V1: ; %bb.0: ; %entry +; V1-NEXT: s_add_f32 s104, s0, s1 +; V1-NEXT: s_sub_f32 s103, s0, s1 +; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; V1-NEXT: v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s103, v1 +; V1-NEXT: v_readfirstlane_b32 s0, v0 +; V1-NEXT: s_delay_alu instid0(VALU_DEP_2) +; V1-NEXT: v_readfirstlane_b32 s1, v1 +; V1-NEXT: v_mul_f32_e32 v4, v0, v1 +; V1-NEXT: s_and_b32 s0, s0, s1 +; V1-NEXT: global_store_b32 v[2:3], v4, off +; V1-NEXT: s_cmp_lg_u32 s0, 0 +; V1-NEXT: s_mov_b32 s0, 0 +; V1-NEXT: s_cbranch_scc0 .LBB0_5 +; V1-NEXT: ; %bb.1: ; %false +; V1-NEXT: s_mov_b32 s7, s5 +; V1-NEXT: s_mov_b32 s6, s4 +; V1-NEXT: s_mov_b32 s5, s3 +; V1-NEXT: s_mov_b32 s4, s2 +; V1-NEXT: s_and_b32 s2, s104, s103 +; V1-NEXT: s_buffer_load_b32 s1, s[4:7], 0x0 +; V1-NEXT: v_add_f32_e32 v0, v0, v1 +; V1-NEXT: s_mov_b32 s8, exec_lo +; V1-NEXT: s_wait_kmcnt 0x0 +; V1-NEXT: s_lshl_b32 vcc_hi, s1, s2 +; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; V1-NEXT: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v1 +; V1-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo +; V1-NEXT: s_and_not1_b32 s8, s8, s1 +; V1-NEXT: s_cbranch_scc0 .LBB0_6 +; V1-NEXT: ; %bb.2: ; %false +; V1-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; V1-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; V1-NEXT: s_wait_alu 0xfffe +; V1-NEXT: s_cbranch_vccnz .LBB0_4 +; V1-NEXT: .LBB0_3: ; %true +; V1-NEXT: v_mul_f32_e32 v0, v1, v4 +; V1-NEXT: .LBB0_4: ; %final +; V1-NEXT: s_branch .LBB0_7 +; V1-NEXT: .LBB0_5: +; V1-NEXT: ; implicit-def: $vgpr0 +; V1-NEXT: s_branch .LBB0_3 +; V1-NEXT: .LBB0_6: +; V1-NEXT: s_mov_b32 exec_lo, 0 +; V1-NEXT: export mrt0 off, off, off, off done +; V1-NEXT: s_endpgm +; V1-NEXT: .LBB0_7: +; +; V2-LABEL: fadd_f32: +; V2: ; %bb.0: ; %entry +; V2-NEXT: s_add_f32 s62, s0, s1 +; V2-NEXT: s_sub_f32 s61, s0, s1 +; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; V2-NEXT: v_dual_add_f32 v0, s62, v0 :: v_dual_add_f32 v1, s61, v1 +; V2-NEXT: v_readfirstlane_b32 s1, v0 +; V2-NEXT: s_delay_alu instid0(VALU_DEP_2) +; V2-NEXT: v_readfirstlane_b32 vcc_lo, v1 +; V2-NEXT: v_mul_f32_e32 v4, v0, v1 +; V2-NEXT: s_and_b32 s1, s1, vcc_lo +; V2-NEXT: global_store_b32 v[2:3], v4, off +; V2-NEXT: s_cmp_lg_u32 s1, 0 +; V2-NEXT: s_mov_b32 s1, 0 +; V2-NEXT: s_cbranch_scc0 .LBB0_5 +; V2-NEXT: ; %bb.1: ; %false +; V2-NEXT: s_mov_b32 s55, s5 +; V2-NEXT: s_mov_b32 s54, s4 +; V2-NEXT: s_mov_b32 s53, s3 +; V2-NEXT: s_mov_b32 s52, s2 +; V2-NEXT: v_add_f32_e32 v0, v0, v1 +; V2-NEXT: s_buffer_load_b32 vcc_lo, s[52:55], 0x0 +; V2-NEXT: s_and_b32 s54, s62, s61 +; V2-NEXT: s_mov_b32 s69, exec_lo +; V2-NEXT: s_wait_kmcnt 0x0 +; V2-NEXT: s_lshl_b32 s67, vcc_lo, s54 +; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; V2-NEXT: v_cmp_ne_u32_e32 vcc_lo, s67, v1 +; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; V2-NEXT: s_and_not1_b32 s69, s69, vcc_lo +; V2-NEXT: s_cbranch_scc0 .LBB0_6 +; V2-NEXT: ; %bb.2: ; %false +; V2-NEXT: s_and_b32 exec_lo, exec_lo, s69 +; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; V2-NEXT: 
s_cbranch_vccnz .LBB0_4 +; V2-NEXT: .LBB0_3: ; %true +; V2-NEXT: v_mul_f32_e32 v0, v1, v4 +; V2-NEXT: .LBB0_4: ; %final +; V2-NEXT: s_branch .LBB0_7 +; V2-NEXT: .LBB0_5: +; V2-NEXT: ; implicit-def: $vgpr0 +; V2-NEXT: s_branch .LBB0_3 +; V2-NEXT: .LBB0_6: +; V2-NEXT: s_mov_b32 exec_lo, 0 +; V2-NEXT: export mrt0 off, off, off, off done +; V2-NEXT: s_endpgm +; V2-NEXT: .LBB0_7: +; +; V3-LABEL: fadd_f32: +; V3: ; %bb.0: ; %entry +; V3-NEXT: s_add_f32 s104, s0, s1 +; V3-NEXT: s_sub_f32 s82, s0, s1 +; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; V3-NEXT: v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s82, v1 +; V3-NEXT: v_readfirstlane_b32 s0, v0 +; V3-NEXT: s_delay_alu instid0(VALU_DEP_2) +; V3-NEXT: v_readfirstlane_b32 s1, v1 +; V3-NEXT: v_mul_f32_e32 v4, v0, v1 +; V3-NEXT: s_and_b32 s0, s0, s1 +; V3-NEXT: global_store_b32 v[2:3], v4, off +; V3-NEXT: s_cmp_lg_u32 s0, 0 +; V3-NEXT: s_mov_b32 s0, 0 +; V3-NEXT: s_cbranch_scc0 .LBB0_5 +; V3-NEXT: ; %bb.1: ; %false +; V3-NEXT: s_mov_b32 s7, s5 +; V3-NEXT: s_mov_b32 s6, s4 +; V3-NEXT: s_mov_b32 s5, s3 +; V3-NEXT: s_mov_b32 s4, s2 +; V3-NEXT: v_add_f32_e32 v0, v0, v1 +; V3-NEXT: s_buffer_load_b32 s1, s[4:7], 0x0 +; V3-NEXT: s_and_b32 s4, s104, s82 +; V3-NEXT: s_mov_b32 s8, exec_lo +; V3-NEXT: s_wait_kmcnt 0x0 +; V3-NEXT: s_lshl_b32 s82, s1, s4 +; V3-NEXT: s_wait_alu 0xfffe +; V3-NEXT: v_cmp_ne_u32_e32 vcc_lo, s82, v1 +; V3-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo +; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; V3-NEXT: s_and_not1_b32 s8, s8, s1 +; V3-NEXT: s_cbranch_scc0 .LBB0_6 +; V3-NEXT: ; %bb.2: ; %false +; V3-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; V3-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; V3-NEXT: s_cbranch_vccnz .LBB0_4 +; V3-NEXT: .LBB0_3: ; %true +; V3-NEXT: v_mul_f32_e32 v0, v1, v4 +; V3-NEXT: .LBB0_4: ; %final +; V3-NEXT: s_branch .LBB0_7 +; V3-NEXT: .LBB0_5: +; V3-NEXT: ; implicit-def: $vgpr0 +; V3-NEXT: s_branch .LBB0_3 +; V3-NEXT: .LBB0_6: +; V3-NEXT: s_mov_b32 exec_lo, 0 +; V3-NEXT: export mrt0 off, off, off, off done +; V3-NEXT: s_endpgm +; V3-NEXT: .LBB0_7: +entry: + %s.0 = fadd float %a, %b + %s.1 = fsub float %a, %b + %v.0 = fadd float %c, %s.0 + %v.1 = fadd float %d, %s.1 + %v.2 = fmul float %v.0, %v.1 + store float %v.2, ptr addrspace(1) %out + %tmp.0 = bitcast float %v.0 to i32 + %tmp.1 = bitcast float %v.1 to i32 + %tmp.2 = bitcast float %s.0 to i32 + %tmp.3 = bitcast float %s.1 to i32 + %s.3 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %tmp.0) + %s.4 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %tmp.1) + %s.5 = and i32 %s.3, %s.4 + %s.6 = and i32 %tmp.2, %tmp.3 + %c.0 = icmp eq i32 %s.5, 0 + br i1 %c.0, label %true, label %false +true: + %v.3 = fmul float %v.1, %v.2 + br label %final +false: + %v.4 = fadd float %v.0, %v.1 + %s.7 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) + %s.8 = shl i32 %s.7, %s.6 + %c.1 = icmp ne i32 %tmp.1, %s.8 + call void @llvm.amdgcn.wqm.demote(i1 %c.1) + br label %final +final: + %res = phi float [ %v.4, %false ], [ %v.3, %true ] + ret float %res +} + +declare i32 @llvm.amdgcn.readfirstlane.i32(i32) +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) +declare void @llvm.amdgcn.wqm.demote(i1)
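
A minimal usage sketch, not part of the patch itself: the avoidance strategy is selected globally with the hidden llc option added above, or per function through the "amdgpu-sgpr-hazard-regalloc" string attribute parsed by SIRegisterInfo::getSGPRHazardAvoidanceStrategy(); the command-line option takes precedence whenever it is given explicitly. The function below is a hypothetical illustration, not taken from the tests.

; Global selection of strategy 3 (the partitioning heuristic):
;   llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 input.ll
; Per-function selection of strategy 2 via the parsed-integer attribute:
define amdgpu_ps float @hazard_hint_demo(float inreg %a, float %b) "amdgpu-sgpr-hazard-regalloc"="2" {
entry:
  %v = fadd float %a, %b
  ret float %v
}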