
[AMDGPU] Add option to bias SGPR allocation to reduce read hazards #129869


Open — wants to merge 1 commit into main from amdgpu-avoid-sgpr-hazards
Conversation

@perlfu (Contributor) commented Mar 5, 2025

  • Scan for potential hazards in virtual registers before SGPR allocation.
  • Use this data to build new allocation order via allocation hints.

@llvmbot (Member) commented Mar 5, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes
  • Scan for potential hazards in virtual registers before SGPR allocation.
  • Use this data to build new allocation order via allocation hints.

Patch is 25.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129869.diff

10 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+3)
  • (added) llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp (+102)
  • (added) llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h (+25)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+3)
  • (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
  • (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+1)
  • (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+169-1)
  • (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+7)
  • (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+4)
  • (added) llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll (+229)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..ac3d683ffeea2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -478,6 +478,9 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+void initializeAMDGPUMarkSGPRHazardRegsLegacyPass(PassRegistry &);
+extern char &AMDGPUMarkSGPRHazardRegsLegacyID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
new file mode 100644
index 0000000000000..46dfcbb48e54f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
@@ -0,0 +1,102 @@
+//===- AMDGPUMarkSGPRHazardRegs.cpp - Annotate SGPRs used by VALU ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to mark SGPRs used by VALU.
+///       Marks can be used during register allocation to reduce hazards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMarkSGPRHazardRegs.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-sgpr-hazard-regs"
+
+namespace {
+
+class AMDGPUMarkSGPRHazardRegs {
+public:
+  AMDGPUMarkSGPRHazardRegs() {}
+  bool run(MachineFunction &MF);
+};
+
+class AMDGPUMarkSGPRHazardRegsLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUMarkSGPRHazardRegsLegacy() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+    return AMDGPUMarkSGPRHazardRegs().run(MF);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+bool AMDGPUMarkSGPRHazardRegs::run(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasVALUReadSGPRHazard())
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (!TRI->getSGPRHazardAvoidanceStrategy(MF))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "AMDGPUMarkSGPRHazardRegs: function " << MF.getName()
+                    << "\n");
+
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+    Register Reg = Register::index2VirtReg(I);
+    if (MRI->reg_nodbg_empty(Reg))
+      continue;
+    const auto *RC = MRI->getRegClass(Reg);
+    if (!RC || !TRI->isSGPRClass(RC))
+      continue;
+    for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
+      const MachineInstr &MI = *MO.getParent();
+      if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
+        FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+        break;
+      }
+    }
+  }
+
+  return true;
+}
+
+INITIALIZE_PASS(AMDGPUMarkSGPRHazardRegsLegacy, DEBUG_TYPE,
+                "AMDGPU Mark Hazard SGPRs", false, false)
+
+char AMDGPUMarkSGPRHazardRegsLegacy::ID = 0;
+
+char &llvm::AMDGPUMarkSGPRHazardRegsLegacyID =
+    AMDGPUMarkSGPRHazardRegsLegacy::ID;
+
+PreservedAnalyses
+AMDGPUMarkSGPRHazardRegsPass::run(MachineFunction &MF,
+                                  MachineFunctionAnalysisManager &MFAM) {
+  AMDGPUMarkSGPRHazardRegs().run(MF);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
new file mode 100644
index 0000000000000..89905ceb1185d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
@@ -0,0 +1,25 @@
+//===--- AMDGPUMarkSGPRHazardRegs.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUMarkSGPRHazardRegsPass
+    : public PassInfoMixin<AMDGPUMarkSGPRHazardRegsPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..889006b1086c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPUMacroFusion.h"
+#include "AMDGPUMarkSGPRHazardRegs.h"
 #include "AMDGPUOpenCLEnqueuedBlockLowering.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPURemoveIncompatibleFunctions.h"
@@ -560,6 +561,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeGCNRegPressurePrinterPass(*PR);
   initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
   initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
+  initializeAMDGPUMarkSGPRHazardRegsLegacyPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1613,6 +1615,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
 
   addPass(&GCNPreRALongBranchRegID);
 
+  addPass(&AMDGPUMarkSGPRHazardRegsLegacyID);
   addPass(createSGPRAllocPass(true));
 
   // Commit allocated register changes. This is mostly necessary because too
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 408da0536237e..83de8ac1c7535 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -85,6 +85,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
+  AMDGPUMarkSGPRHazardRegs.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
   AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 721601efcc804..bf91118ec148a 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1066,6 +1066,7 @@ namespace VirtRegFlag {
 enum Register_Flag : uint8_t {
   // Register operand in a whole-wave mode operation.
   WWM_REG = 1 << 0,
+  SGPR_HAZARD_REG = 1 << 1
 };
 
 } // namespace VirtRegFlag
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 128cd8244a477..d210d701b9306 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -11,14 +11,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "SIRegisterInfo.h"
 #include "AMDGPU.h"
 #include "AMDGPURegisterBankInfo.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35,6 +36,10 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<unsigned> SGPRHazardAvoidanceStrategy(
+    "amdgpu-sgpr-hazard-regalloc", cl::init(0), cl::ReallyHidden,
+    cl::desc("Register allocation strategy to reduce SGPR read hazards"));
+
 std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
 
@@ -3904,3 +3909,166 @@ SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
     RegFlags.push_back("WWM_REG");
   return RegFlags;
 }
+
+unsigned SIRegisterInfo::getSGPRHazardAvoidanceStrategy(
+    const MachineFunction &MF) const {
+  if (SGPRHazardAvoidanceStrategy.getNumOccurrences()) {
+    return SGPRHazardAvoidanceStrategy;
+  } else {
+    return MF.getFunction().getFnAttributeAsParsedInteger(
+        "amdgpu-sgpr-hazard-regalloc", 0);
+  }
+}
+
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+                                           ArrayRef<MCPhysReg> Order,
+                                           SmallVectorImpl<MCPhysReg> &Hints,
+                                           const MachineFunction &MF,
+                                           const VirtRegMap *VRM,
+                                           const LiveRegMatrix *Matrix) const {
+  bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+      VirtReg, Order, Hints, MF, VRM, Matrix);
+  if (!VRM)
+    return BaseImplRetVal;
+
+  // Only use hinting to reduce SGPR read hazards when required.
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasVALUReadSGPRHazard())
+    return BaseImplRetVal;
+
+  // Only treat SGPRs
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const auto *RC = MRI->getRegClass(VirtReg);
+  if (!isSGPRClass(RC))
+    return BaseImplRetVal;
+
+  const unsigned Strategy = getSGPRHazardAvoidanceStrategy(MF);
+  if (!Strategy)
+    return BaseImplRetVal;
+
+  SmallSet<MCPhysReg, 4> CopyHints;
+  CopyHints.insert(Hints.begin(), Hints.end());
+
+  auto AddHint = [&](MCPhysReg PhysReg) {
+    if (CopyHints.contains(PhysReg) || MRI->isReserved(PhysReg))
+      return;
+    Hints.push_back(PhysReg);
+  };
+  auto AddHints = [&](ArrayRef<MCPhysReg> Regs) {
+    for (MCPhysReg PhysReg : Regs)
+      AddHint(PhysReg);
+  };
+
+  // V1: simply reverse allocation order, mean 23% reduction in hazards
+  if (Strategy == 1) {
+    if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+      for (MCPhysReg PhysReg : reverse(Order))
+        AddHint(PhysReg);
+    } else {
+      for (MCPhysReg PhysReg : Order)
+        AddHint(PhysReg);
+    }
+    return true;
+  }
+
+  // Build set of current hazard pairs from live matrix
+  auto *LiveUnions = const_cast<LiveRegMatrix *>(Matrix)->getLiveUnions();
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+  DenseMap<MCPhysReg, unsigned> IntervalCount;
+  std::bitset<64> HazardPairs;
+
+  for (MCPhysReg PhysReg : Order) {
+    SmallSet<const LiveInterval *, 4> Intervals;
+    bool IsHazard = false;
+    for (auto Unit : TRI->regunits(PhysReg)) {
+      LiveIntervalUnion &LIU = LiveUnions[Unit];
+      for (const LiveInterval *LI : LIU.getMap()) {
+        Intervals.insert(LI);
+        if (FuncInfo->checkFlag(LI->reg(),
+                                AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+          IsHazard = true;
+          // Break here as we only care about interval count for non-hazard regs
+          break;
+        }
+      }
+      if (IsHazard)
+        break;
+    }
+    if (IsHazard) {
+      unsigned PairN = TRI->getEncodingValue(PhysReg) >> 1;
+      if (PairN <= 63)
+        HazardPairs.set(PairN);
+    }
+    IntervalCount[PhysReg] = Intervals.size();
+  }
+
+  // V2: weight the entire order based on hazard free usage, mean 30% reduction
+  // in hazards
+  if (Strategy == 2) {
+    bool VRegIsHazard =
+        FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+    SmallVector<MCPhysReg> NewOrder(Order);
+    std::sort(NewOrder.begin(), NewOrder.end(), [&](MCPhysReg A, MCPhysReg B) {
+      return VRegIsHazard ? IntervalCount[A] < IntervalCount[B]
+                          : IntervalCount[B] < IntervalCount[A];
+    });
+    AddHints(NewOrder);
+    return true;
+  }
+
+  // V3: complex partitioning, mean 35% reduction in hazards
+  assert(Strategy == 3);
+
+  // Partition the allocation order based on hazards
+  SmallVector<MCPhysReg> Unallocated, UnallocatedWithHazard;
+  SmallVector<MCPhysReg> Allocated, AllocatedWithHazard;
+
+  for (MCPhysReg PhysReg : Order) {
+    Register VReg = Matrix->getOneVReg(PhysReg);
+    bool HasHazard = false;
+    // XXX: can remove regunit scan for just SGPR32/SGPR64
+    for (auto Unit : TRI->regunits(PhysReg)) {
+      unsigned PairN = TRI->getEncodingValue(Unit) >> 1;
+      if (PairN <= 63 && HazardPairs[PairN]) {
+        HasHazard = true;
+        break;
+      }
+    }
+    if (VReg == MCRegister::NoRegister) {
+      if (HasHazard)
+        UnallocatedWithHazard.push_back(PhysReg);
+      else
+        Unallocated.push_back(PhysReg);
+    } else {
+      if (HasHazard)
+        AllocatedWithHazard.push_back(PhysReg);
+      else
+        Allocated.push_back(PhysReg);
+    }
+  }
+
+  if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+    // Reorder allocations based on usage, so least used will be reused first.
+    // This means least used regs are touched by hazards first.
+    std::sort(Allocated.begin(), Allocated.end(),
+              [&](MCPhysReg A, MCPhysReg B) {
+                return IntervalCount[A] < IntervalCount[B];
+              });
+    // Reverse order of allocations to try to keep hazards away - yes it helps.
+    std::reverse(Unallocated.begin(), Unallocated.end());
+
+    AddHints(AllocatedWithHazard);
+    AddHints(UnallocatedWithHazard);
+    AddHints(Unallocated);
+    AddHints(Allocated);
+  } else {
+    AddHints(Allocated);
+    AddHints(Unallocated);
+    AddHints(UnallocatedWithHazard);
+    AddHints(AllocatedWithHazard);
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a64180daea2ad..5ccede9ca3529 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -465,6 +465,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
 
   SmallVector<StringLiteral>
   getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+  unsigned getSGPRHazardAvoidanceStrategy(const MachineFunction &MF) const;
+
+  bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+                             SmallVectorImpl<MCPhysReg> &Hints,
+                             const MachineFunction &MF, const VirtRegMap *VRM,
+                             const LiveRegMatrix *Matrix) const override;
 };
 
 namespace AMDGPU {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d7f54f3b8e9e2..b933286b8c6ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -357,6 +357,7 @@
 ; GCN-O1-NEXT:        SI Whole Quad Mode
 ; GCN-O1-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O1-NEXT:        AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-NEXT:        AMDGPU Mark Hazard SGPRs
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Machine Block Frequency Analysis
 ; GCN-O1-NEXT:        Debug Variable Analysis
@@ -670,6 +671,7 @@
 ; GCN-O1-OPTS-NEXT:        SI Whole Quad Mode
 ; GCN-O1-OPTS-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O1-OPTS-NEXT:        AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-OPTS-NEXT:        AMDGPU Mark Hazard SGPRs
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:        Debug Variable Analysis
@@ -989,6 +991,7 @@
 ; GCN-O2-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O2-NEXT:        SI Form memory clauses
 ; GCN-O2-NEXT:        AMDGPU Pre-RA Long Branch Reg
+; GCN-O2-NEXT:        AMDGPU Mark Hazard SGPRs
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Machine Block Frequency Analysis
 ; GCN-O2-NEXT:        Debug Variable Analysis
@@ -1321,6 +1324,7 @@
 ; GCN-O3-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O3-NEXT:        SI Form memory clauses
 ; GCN-O3-NEXT:        AMDGPU Pre-RA Long Branch Reg
+; GCN-O3-NEXT:        AMDGPU Mark Hazard SGPRs
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Machine Block Frequency Analysis
 ; GCN-O3-NEXT:        Debug Variable Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
new file mode 100644
index 0000000000000..91b06588aa2cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=0 < %s | FileCheck -check-prefix DEF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=1 < %s | FileCheck -check-prefix V1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=2 < %s | FileCheck -check-prefix V2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 < %s | FileCheck -check-prefix V3 %s
+
+define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float %d, ptr addrspace(1) %out) {
+; DEF-LABEL: fadd_f32:
+; DEF:       ; %bb.0: ; %entry
+; DEF-NEXT:    s_add_f32 s3, s0, s1
+; DEF-NEXT:    s_sub_f32 s1, s0, s1
+; DEF-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DEF-NEXT:    v_dual_add_f32 v0, s3, v0 :: v_dual_add_f32 v1, s1, v1
+; DEF-NEXT:    v_readfirstlane_b32 s0, v0
+; DEF-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; DEF-NEXT:    v_readfirstlane_b32 s4, v1
+; DEF-NEXT:    v_mul_f32_e32 v4, v0, v1
+; DEF-NEXT:    s_and_b32 s0, s0, s4
+; DEF-NEXT:    global_store_b32 v[2:3], v4, off
+; DEF-NEXT:    s_wait_alu 0xfffe
+; DEF-NEXT:    s_cmp_lg_u32 s0, 0
+; DEF-NEXT:    s_mov_b32 s0, 0
+; DEF-NEXT:    s_cbranch_scc0 .LBB0_5
+; DEF-NEXT:  ; %bb.1: ; %false
+; DEF-NEXT:    s_mov_b32 s2, exec_lo
+; DEF-NEXT:    v_add_f32_e32 v0, v0, v1
+; DEF-NEXT:    s_buffer_load_b32 s4, s[0:3], 0x0
+; DEF-NEXT:    s_and_b32 s1, s3, s1
+; DEF-NEXT:    s_wait_kmcnt 0x0
+; DEF-NEXT:    s_wait_alu 0xfffe
+; DEF-NEXT:    s_lshl_b32 s1, s4, s1
+; DEF-NEXT:    s_wait_alu 0xfffe
+; DEF-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s1, v1
+; DEF-NEXT:    s_and_not1_b32 s1, exec_lo, vcc_lo
+; DEF-NEXT:    s_wait_alu 0xfffe
+; DEF-NEXT:    s_and_not1_b32 s2, s2, s1
+; DEF-NEXT:    s_cbranch_scc0 .LBB0_6
+; DEF-NEXT:  ; %bb.2: ; %false
+; DEF-NEXT:    s_wait_alu 0xfffe
+; DEF-NEXT:    s_and_b32 exec_lo, exec_lo, s2
+; DEF-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DEF-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; DEF-NEXT:    s_cbranch_vccnz .LBB0_4
+; DEF-NEXT:  .LBB0_3: ; %true
+; DEF-NEXT:    v_mul_f32_e32 v0, v1, v4
+; DEF-NEXT:  .LBB0_4: ; %final
+; DEF-NEXT:    s_branch .LBB0_7
+; DEF-NEXT:  .LBB0_5:
+; DEF-NEXT:    ; implicit-def: $vgpr0
+; DEF-NEXT:    s_branch .LBB0_3
+; DEF-NEXT:  .LBB0_6:
+; DEF-NEXT:    s_mov_b32 exec_lo, 0
+; DEF-NEXT:    export mrt0 off, off, off, off done
+; DEF-NEXT:    s_endpgm
+; DEF-NEXT:  .LBB0_7:
+;
+; V1-LABEL: fadd_f32:
+; V1:       ; %bb.0: ; %entry
+; V1-NEXT:    s_add_f32 s104, s0, s1
+; V1-NEXT:    s_sub_f32 s103, s0, s1
+; V1-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V1-NEXT:    v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s103, v1
+; V1-NEXT:    v_readfirstlane_b32 s0, v0
+; V1-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; V1-NEXT:    v_readfirstlane_b32 s1, v1
+; V1-NEXT:    v_mul_f32_e32 v4, v0, v1
+; V1-NEXT:    s_and_b32 s0, s0, s1
+; V1-NEXT:    global_store_b32 v[2:3], v4, off
+; V1-NEXT:    s_cmp_lg_u32 s0, 0
+; V1-NEXT:    s_mov_b32 s0, 0
...
[truncated]

github-actions bot commented Mar 5, 2025

✅ With the latest revision this PR passed the undef deprecator.

@arsenm (Contributor) left a comment

Why does this need to be a dynamic property built on top of hints? We can statically change the allocation order in tablegen


  // V1: simply reverse allocation order, mean 23% reduction in hazards
  if (Strategy == 1) {
    if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
Contributor

Why do we need to use flags for this? The existing usage of the flags is already distasteful

Contributor Author (perlfu)

If you have a better suggestion for precomputing information related to a register and hanging that information off the register itself, then I'd be interested.
However, this use case seems well suited to flags.

@perlfu (Contributor, Author) commented Mar 6, 2025

> Why does this need to be a dynamic property built on top of hints? We can statically change the allocation order in tablegen

Only the reversed allocation order could be generated statically, as the other strategies are based on the actual intersection of VALU and SALU register usage at allocation time.
We could generate that order and switch all GFX12 targets to it.
It has the side effect that all waves require maximum SGPR allocation, although this is not an issue for RDNA*, where the full SGPR file is always available.

- Scan for potential hazards in virtual registers before SGPR allocation.
- Use this data to build new allocation order via allocation hints.
@perlfu force-pushed the amdgpu-avoid-sgpr-hazards branch from e8a461e to edcbb37 on April 2, 2025 02:24
@perlfu
Copy link
Contributor Author

perlfu commented Apr 2, 2025

  • Rebase

Ping

Comment on lines +77 to +80
    for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
      const MachineInstr &MI = *MO.getParent();
      if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
        FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
Contributor

I don't think this usage of flags is appropriate. In the WWM case, we have a hard dependency on the flags propagating through a split / spill. That does not apply in this case. After a virtual register is split, do you really want the flag to apply to the copied register? The copy will still be an SGPR use.

If the extent of the analysis is to do a scan over use operands, you can do this directly in getRegAllocationHints. (Also, I think this hook should be renamed; I didn't realize this was a wholly separate mechanism for changing the allocation order, compared to MachineRegisterInfo's addRegAllocationHint.)

Additionally, I think we could look into consolidating the interfaces for controlling allocation order. We have different PriorityAdvisor types for MLRegAlloc; can we do something with that to modernize getRegAllocationHints?

Contributor Author (perlfu)

I don't think it's unreasonable to merge the analysis into getRegAllocationHints and remove the flag; however, doing a pass over the entire function for every SGPR call to getRegAllocationHints seemed terribly inefficient.
getRegAllocationHints is also const, which means caching the analysis results will require some fiddling.
I'll revise this change in that direction.
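
As a reference for that revision, here is a minimal sketch of the merged approach (hypothetical code, not part of the posted patch: the member names and the helper isVALUReadSGPR are invented here). It memoizes the use-operand scan behind mutable members so the const hook only pays for it once per function; vregs created after the scan (e.g. by live-range splitting) fall outside the cache and default to hazard-free, which also sidesteps the flag-through-split question above.

// Hypothetical members in SIRegisterInfo (sketch only):
//   mutable const MachineFunction *HazardScanMF = nullptr;
//   mutable BitVector HazardVRegs;

bool SIRegisterInfo::isVALUReadSGPR(Register VirtReg,
                                    const MachineFunction &MF) const {
  if (HazardScanMF != &MF) {
    // First query for this function: scan all SGPR vregs for VALU uses.
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    HazardVRegs.clear();
    HazardVRegs.resize(MRI.getNumVirtRegs());
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      Register Reg = Register::index2VirtReg(I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC || !isSGPRClass(RC))
        continue;
      for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
        if (MO.isUse() && SIInstrInfo::isVALU(*MO.getParent())) {
          HazardVRegs.set(I);
          break;
        }
      }
    }
    HazardScanMF = &MF;
  }
  // Vregs minted after the scan are out of range and treated as hazard-free.
  unsigned Idx = Register::virtReg2Index(VirtReg);
  return Idx < HazardVRegs.size() && HazardVRegs.test(Idx);
}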
