[AMDGPU] Add option to bias SGPR allocation to reduce read hazards #129869
Conversation
perlfu commented Mar 5, 2025
- Scan for potential hazards in virtual registers before SGPR allocation.
- Use this data to build a new allocation order via allocation hints.
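The biasing is opt-in: a strategy (1-3) is selected via the hidden llc option added in this patch, or per function through a string attribute of the same name (the attribute spelling is inferred from the getFnAttributeAsParsedInteger call in the diff below, so treat it as an assumption). A minimal usage sketch, mirroring the RUN lines of the new test:

  llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 test.ll

  ; hypothetical per-function equivalent
  attributes #0 = { "amdgpu-sgpr-hazard-regalloc"="3" }

Strategy 0 (the default) leaves the allocation order untouched.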
@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes
Patch is 25.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129869.diff

10 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..ac3d683ffeea2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -478,6 +478,9 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+void initializeAMDGPUMarkSGPRHazardRegsLegacyPass(PassRegistry &);
+extern char &AMDGPUMarkSGPRHazardRegsLegacyID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
new file mode 100644
index 0000000000000..46dfcbb48e54f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
@@ -0,0 +1,102 @@
+//===- AMDGPUMarkSGPRHazardRegs.cpp - Annotate SGPRs used by VALU ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to mark SGPRs used by VALU.
+/// Marks can be used during register allocation to reduce hazards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMarkSGPRHazardRegs.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-sgpr-hazard-regs"
+
+namespace {
+
+class AMDGPUMarkSGPRHazardRegs {
+public:
+ AMDGPUMarkSGPRHazardRegs() {}
+ bool run(MachineFunction &MF);
+};
+
+class AMDGPUMarkSGPRHazardRegsLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUMarkSGPRHazardRegsLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ return AMDGPUMarkSGPRHazardRegs().run(MF);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+bool AMDGPUMarkSGPRHazardRegs::run(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasVALUReadSGPRHazard())
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (!TRI->getSGPRHazardAvoidanceStrategy(MF))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AMDGPUMarkSGPRHazardRegs: function " << MF.getName()
+ << "\n");
+
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ const auto *RC = MRI->getRegClass(Reg);
+ if (!RC || !TRI->isSGPRClass(RC))
+ continue;
+ for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
+ const MachineInstr &MI = *MO.getParent();
+ if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
+ FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+ break;
+ }
+ }
+ }
+
+ return true;
+}
+
+INITIALIZE_PASS(AMDGPUMarkSGPRHazardRegsLegacy, DEBUG_TYPE,
+ "AMDGPU Mark Hazard SGPRs", false, false)
+
+char AMDGPUMarkSGPRHazardRegsLegacy::ID = 0;
+
+char &llvm::AMDGPUMarkSGPRHazardRegsLegacyID =
+ AMDGPUMarkSGPRHazardRegsLegacy::ID;
+
+PreservedAnalyses
+AMDGPUMarkSGPRHazardRegsPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ AMDGPUMarkSGPRHazardRegs().run(MF);
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
new file mode 100644
index 0000000000000..89905ceb1185d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
@@ -0,0 +1,25 @@
+//===--- AMDGPUMarkSGPRHazardRegs.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUMarkSGPRHazardRegsPass
+ : public PassInfoMixin<AMDGPUMarkSGPRHazardRegsPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..889006b1086c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
+#include "AMDGPUMarkSGPRHazardRegs.h"
#include "AMDGPUOpenCLEnqueuedBlockLowering.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
@@ -560,6 +561,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGCNRegPressurePrinterPass(*PR);
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
+ initializeAMDGPUMarkSGPRHazardRegsLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1613,6 +1615,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
addPass(&GCNPreRALongBranchRegID);
+ addPass(&AMDGPUMarkSGPRHazardRegsLegacyID);
addPass(createSGPRAllocPass(true));
// Commit allocated register changes. This is mostly necessary because too
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 408da0536237e..83de8ac1c7535 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -85,6 +85,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
+ AMDGPUMarkSGPRHazardRegs.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 721601efcc804..bf91118ec148a 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1066,6 +1066,7 @@ namespace VirtRegFlag {
enum Register_Flag : uint8_t {
// Register operand in a whole-wave mode operation.
WWM_REG = 1 << 0,
+ SGPR_HAZARD_REG = 1 << 1
};
} // namespace VirtRegFlag
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 128cd8244a477..d210d701b9306 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -11,14 +11,15 @@
//
//===----------------------------------------------------------------------===//
+#include "SIRegisterInfo.h"
#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35,6 +36,10 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<unsigned> SGPRHazardAvoidanceStrategy(
+ "amdgpu-sgpr-hazard-regalloc", cl::init(0), cl::ReallyHidden,
+ cl::desc("Register allocation strategy to reduce SGPR read hazards"));
+
std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
@@ -3904,3 +3909,166 @@ SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
RegFlags.push_back("WWM_REG");
return RegFlags;
}
+
+unsigned SIRegisterInfo::getSGPRHazardAvoidanceStrategy(
+ const MachineFunction &MF) const {
+ if (SGPRHazardAvoidanceStrategy.getNumOccurrences()) {
+ return SGPRHazardAvoidanceStrategy;
+ } else {
+ return MF.getFunction().getFnAttributeAsParsedInteger(
+ "amdgpu-sgpr-hazard-regalloc", 0);
+ }
+}
+
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+ if (!VRM)
+ return BaseImplRetVal;
+
+ // Only use hinting to reduce SGPR read hazards when required.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasVALUReadSGPRHazard())
+ return BaseImplRetVal;
+
+ // Only consider SGPRs
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const auto *RC = MRI->getRegClass(VirtReg);
+ if (!isSGPRClass(RC))
+ return BaseImplRetVal;
+
+ const unsigned Strategy = getSGPRHazardAvoidanceStrategy(MF);
+ if (!Strategy)
+ return BaseImplRetVal;
+
+ SmallSet<MCPhysReg, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+
+ auto AddHint = [&](MCPhysReg PhysReg) {
+ if (CopyHints.contains(PhysReg) || MRI->isReserved(PhysReg))
+ return;
+ Hints.push_back(PhysReg);
+ };
+ auto AddHints = [&](ArrayRef<MCPhysReg> Regs) {
+ for (MCPhysReg PhysReg : Regs)
+ AddHint(PhysReg);
+ };
+
+ // V1: simply reverse allocation order, mean 23% reduction in hazards
+ if (Strategy == 1) {
+ if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ for (MCPhysReg PhysReg : reverse(Order))
+ AddHint(PhysReg);
+ } else {
+ for (MCPhysReg PhysReg : Order)
+ AddHint(PhysReg);
+ }
+ return true;
+ }
+
+ // Build set of current hazard pairs from live matrix
+ auto *LiveUnions = const_cast<LiveRegMatrix *>(Matrix)->getLiveUnions();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+ DenseMap<MCPhysReg, unsigned> IntervalCount;
+ std::bitset<64> HazardPairs;
+
+ for (MCPhysReg PhysReg : Order) {
+ SmallSet<const LiveInterval *, 4> Intervals;
+ bool IsHazard = false;
+ for (auto Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion &LIU = LiveUnions[Unit];
+ for (const LiveInterval *LI : LIU.getMap()) {
+ Intervals.insert(LI);
+ if (FuncInfo->checkFlag(LI->reg(),
+ AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ IsHazard = true;
+ // Break here as we only care about interval count for non-hazard regs
+ break;
+ }
+ }
+ if (IsHazard)
+ break;
+ }
+ if (IsHazard) {
+ unsigned PairN = TRI->getEncodingValue(PhysReg) >> 1;
+ if (PairN <= 63)
+ HazardPairs.set(PairN);
+ }
+ IntervalCount[PhysReg] = Intervals.size();
+ }
+
+ // V2: weight the entire order based on hazard free usage, mean 30% reduction
+ // in hazards
+ if (Strategy == 2) {
+ bool VRegIsHazard =
+ FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+ SmallVector<MCPhysReg> NewOrder(Order);
+ std::sort(NewOrder.begin(), NewOrder.end(), [&](MCPhysReg A, MCPhysReg B) {
+ return VRegIsHazard ? IntervalCount[A] < IntervalCount[B]
+ : IntervalCount[B] < IntervalCount[A];
+ });
+ AddHints(NewOrder);
+ return true;
+ }
+
+ // V3: complex partitioning, mean 35% reduction in hazards
+ assert(Strategy == 3);
+
+ // Partition the allocation order based on hazards
+ SmallVector<MCPhysReg> Unallocated, UnallocatedWithHazard;
+ SmallVector<MCPhysReg> Allocated, AllocatedWithHazard;
+
+ for (MCPhysReg PhysReg : Order) {
+ Register VReg = Matrix->getOneVReg(PhysReg);
+ bool HasHazard = false;
+ // XXX: can remove regunit scan for just SGPR32/SGPR64
+ for (auto Unit : TRI->regunits(PhysReg)) {
+ unsigned PairN = TRI->getEncodingValue(Unit) >> 1;
+ if (PairN <= 63 && HazardPairs[PairN]) {
+ HasHazard = true;
+ break;
+ }
+ }
+ if (VReg == MCRegister::NoRegister) {
+ if (HasHazard)
+ UnallocatedWithHazard.push_back(PhysReg);
+ else
+ Unallocated.push_back(PhysReg);
+ } else {
+ if (HasHazard)
+ AllocatedWithHazard.push_back(PhysReg);
+ else
+ Allocated.push_back(PhysReg);
+ }
+ }
+
+ if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ // Reorder allocations based on usage, so least used will be reused first.
+ // This means least used regs are touched by hazards first.
+ std::sort(Allocated.begin(), Allocated.end(),
+ [&](MCPhysReg A, MCPhysReg B) {
+ return IntervalCount[A] < IntervalCount[B];
+ });
+ // Reverse order of allocations to try to keep hazards away - yes it helps.
+ std::reverse(Unallocated.begin(), Unallocated.end());
+
+ AddHints(AllocatedWithHazard);
+ AddHints(UnallocatedWithHazard);
+ AddHints(Unallocated);
+ AddHints(Allocated);
+ } else {
+ AddHints(Allocated);
+ AddHints(Unallocated);
+ AddHints(UnallocatedWithHazard);
+ AddHints(AllocatedWithHazard);
+ }
+
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a64180daea2ad..5ccede9ca3529 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -465,6 +465,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
SmallVector<StringLiteral>
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+ unsigned getSGPRHazardAvoidanceStrategy(const MachineFunction &MF) const;
+
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
};
namespace AMDGPU {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d7f54f3b8e9e2..b933286b8c6ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -357,6 +357,7 @@
; GCN-O1-NEXT: SI Whole Quad Mode
; GCN-O1-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Machine Block Frequency Analysis
; GCN-O1-NEXT: Debug Variable Analysis
@@ -670,6 +671,7 @@
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-OPTS-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Debug Variable Analysis
@@ -989,6 +991,7 @@
; GCN-O2-NEXT: SI optimize exec mask operations pre-RA
; GCN-O2-NEXT: SI Form memory clauses
; GCN-O2-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O2-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Machine Block Frequency Analysis
; GCN-O2-NEXT: Debug Variable Analysis
@@ -1321,6 +1324,7 @@
; GCN-O3-NEXT: SI optimize exec mask operations pre-RA
; GCN-O3-NEXT: SI Form memory clauses
; GCN-O3-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O3-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Machine Block Frequency Analysis
; GCN-O3-NEXT: Debug Variable Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
new file mode 100644
index 0000000000000..91b06588aa2cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=0 < %s | FileCheck -check-prefix DEF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=1 < %s | FileCheck -check-prefix V1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=2 < %s | FileCheck -check-prefix V2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 < %s | FileCheck -check-prefix V3 %s
+
+define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float %d, ptr addrspace(1) %out) {
+; DEF-LABEL: fadd_f32:
+; DEF: ; %bb.0: ; %entry
+; DEF-NEXT: s_add_f32 s3, s0, s1
+; DEF-NEXT: s_sub_f32 s1, s0, s1
+; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DEF-NEXT: v_dual_add_f32 v0, s3, v0 :: v_dual_add_f32 v1, s1, v1
+; DEF-NEXT: v_readfirstlane_b32 s0, v0
+; DEF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; DEF-NEXT: v_readfirstlane_b32 s4, v1
+; DEF-NEXT: v_mul_f32_e32 v4, v0, v1
+; DEF-NEXT: s_and_b32 s0, s0, s4
+; DEF-NEXT: global_store_b32 v[2:3], v4, off
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_cmp_lg_u32 s0, 0
+; DEF-NEXT: s_mov_b32 s0, 0
+; DEF-NEXT: s_cbranch_scc0 .LBB0_5
+; DEF-NEXT: ; %bb.1: ; %false
+; DEF-NEXT: s_mov_b32 s2, exec_lo
+; DEF-NEXT: v_add_f32_e32 v0, v0, v1
+; DEF-NEXT: s_buffer_load_b32 s4, s[0:3], 0x0
+; DEF-NEXT: s_and_b32 s1, s3, s1
+; DEF-NEXT: s_wait_kmcnt 0x0
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_lshl_b32 s1, s4, s1
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: v_cmp_ne_u32_e32 vcc_lo, s1, v1
+; DEF-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_and_not1_b32 s2, s2, s1
+; DEF-NEXT: s_cbranch_scc0 .LBB0_6
+; DEF-NEXT: ; %bb.2: ; %false
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_and_b32 exec_lo, exec_lo, s2
+; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DEF-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; DEF-NEXT: s_cbranch_vccnz .LBB0_4
+; DEF-NEXT: .LBB0_3: ; %true
+; DEF-NEXT: v_mul_f32_e32 v0, v1, v4
+; DEF-NEXT: .LBB0_4: ; %final
+; DEF-NEXT: s_branch .LBB0_7
+; DEF-NEXT: .LBB0_5:
+; DEF-NEXT: ; implicit-def: $vgpr0
+; DEF-NEXT: s_branch .LBB0_3
+; DEF-NEXT: .LBB0_6:
+; DEF-NEXT: s_mov_b32 exec_lo, 0
+; DEF-NEXT: export mrt0 off, off, off, off done
+; DEF-NEXT: s_endpgm
+; DEF-NEXT: .LBB0_7:
+;
+; V1-LABEL: fadd_f32:
+; V1: ; %bb.0: ; %entry
+; V1-NEXT: s_add_f32 s104, s0, s1
+; V1-NEXT: s_sub_f32 s103, s0, s1
+; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V1-NEXT: v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s103, v1
+; V1-NEXT: v_readfirstlane_b32 s0, v0
+; V1-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; V1-NEXT: v_readfirstlane_b32 s1, v1
+; V1-NEXT: v_mul_f32_e32 v4, v0, v1
+; V1-NEXT: s_and_b32 s0, s0, s1
+; V1-NEXT: global_store_b32 v[2:3], v4, off
+; V1-NEXT: s_cmp_lg_u32 s0, 0
+; V1-NEXT: s_mov_b32 s0, 0
...
[truncated]
✅ With the latest revision this PR passed the undef deprecator.
Why does this need to be a dynamic property built on top of hints? We can statically change the allocation order in TableGen.
// V1: simply reverse allocation order, mean 23% reduction in hazards
if (Strategy == 1) {
  if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
Why do we need to use flags for this? The existing usage of the flags is already distasteful
If you have a better suggestion for precomputing information related to a register and hanging that information off the register itself, then I'd be interested. However, this use case seems well suited to flags.
Only the reversed allocation order could be generated statically, as the others are based on the actual intersection of VALU and SALU register usage.
perlfu force-pushed e8a461e to edcbb37:
- Scan for potential hazards in virtual registers before SGPR allocation.
- Use this data to build a new allocation order via allocation hints.
Ping
for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
  const MachineInstr &MI = *MO.getParent();
  if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
    FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
I don't think this usage of flags is appropriate. In the WWM case, we have a hard dependency on the flags propagating through a split/spill. That does not apply in this case. After a virtual register is split, do you really want the flag to apply to the copied register? The copy will still be an SGPR use.
If the extent of the analysis is to do a scan over use operands, you can do this directly in getRegAllocationHints. (Also, I think this hook should be renamed; I didn't realize this was a wholly separate mechanism for changing the allocation order, compared to MachineRegisterInfo's addRegAllocationHint.)
Additionally, I think we could look into consolidating the interfaces for controlling allocation order. We have different PriorityAdvisor types for MLRegAlloc; can we do something with that to modernize getRegAllocationHints?
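(A minimal sketch of that direction, assuming the scan from the pass is simply inlined into the hook; IsVALUReadSGPR is a hypothetical local helper, not part of this patch:)

  auto IsVALUReadSGPR = [&](Register Reg) -> bool {
    // The same scan AMDGPUMarkSGPRHazardRegs performs once up front,
    // now repeated per hint query.
    for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg))
      if (MO.isUse() && SIInstrInfo::isVALU(*MO.getParent()))
        return true;
    return false;
  };
  // ...with IsVALUReadSGPR(VirtReg) standing in for
  // FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)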
I don't think it's unreasonable to merge the analysis into getRegAllocationHints and remove the flag; however, doing a pass over the entire IR for every call to SGPR getRegAllocationHints seemed terribly inefficient. getRegAllocationHints is also const, which means caching the analysis results will require some fiddling.
I'll revise this change in that direction.
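(One hedged sketch of that fiddling, assuming a lazily populated mutable cache so the const hook scans each vreg at most once; VALUReadSGPRCache and isVALUReadSGPR are hypothetical names:)

  // Hypothetical member of SIRegisterInfo or SIMachineFunctionInfo;
  // 'mutable' lets the const hint hook populate it on first use.
  mutable DenseMap<Register, bool> VALUReadSGPRCache;

  bool isVALUReadSGPR(Register Reg, const MachineRegisterInfo &MRI) const {
    auto [It, Inserted] = VALUReadSGPRCache.try_emplace(Reg, false);
    if (Inserted)
      for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg))
        if (MO.isUse() && SIInstrInfo::isVALU(*MO.getParent())) {
          It->second = true;
          break;
        }
    // Caveat: the cache must be cleared per machine function and kept in
    // sync if virtual registers are split or re-created during allocation.
    return It->second;
  }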