diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aba3c0f80a024..297f647575bff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -419,6 +419,7 @@ namespace {
     SDValue visitADDLike(SDNode *N);
     SDValue visitADDLikeCommutative(SDValue N0, SDValue N1,
                                     SDNode *LocReference);
+    SDValue visitPTRADD(SDNode *N);
     SDValue visitSUB(SDNode *N);
     SDValue visitADDSAT(SDNode *N);
     SDValue visitSUBSAT(SDNode *N);
@@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
     return true;
   }
 
-  if (Opc != ISD::ADD)
+  if (Opc != ISD::ADD && Opc != ISD::PTRADD)
     return false;
 
   auto *C2 = dyn_cast<ConstantSDNode>(N1);
@@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::TokenFactor:        return visitTokenFactor(N);
   case ISD::MERGE_VALUES:       return visitMERGE_VALUES(N);
   case ISD::ADD:                return visitADD(N);
+  case ISD::PTRADD:             return visitPTRADD(N);
   case ISD::SUB:                return visitSUB(N);
   case ISD::SADDSAT:
   case ISD::UADDSAT:            return visitADDSAT(N);
@@ -2627,6 +2629,87 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) {
   return SDValue();
 }
 
+/// Try to fold a pointer arithmetic node.
+/// This needs to be done separately from normal addition, because pointer
+/// addition is not commutative.
+SDValue DAGCombiner::visitPTRADD(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT PtrVT = N0.getValueType();
+  EVT IntVT = N1.getValueType();
+  SDLoc DL(N);
+
+  // This is already ensured by an assert in SelectionDAG::getNode(). Several
+  // combines here depend on this assumption.
+  assert(PtrVT == IntVT &&
+         "PTRADD with different operand types is not supported");
+
+  // fold (ptradd x, 0) -> x
+  if (isNullConstant(N1))
+    return N0;
+
+  // fold (ptradd 0, x) -> x
+  if (isNullConstant(N0) && PtrVT == IntVT)
+    return N1;
+
+  if (N0.getOpcode() != ISD::PTRADD ||
+      reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1))
+    return SDValue();
+
+  SDValue X = N0.getOperand(0);
+  SDValue Y = N0.getOperand(1);
+  SDValue Z = N1;
+  bool N0OneUse = N0.hasOneUse();
+  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+  // (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
+  //   * y is a constant and (ptradd x, y) has one use; or
+  //   * y and z are both constants.
+  if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
+    SDNodeFlags Flags;
+    // If both additions in the original were NUW, the new ones are as well.
+    if (N->getFlags().hasNoUnsignedWrap() && N0->getFlags().hasNoUnsignedWrap())
+      Flags |= SDNodeFlags::NoUnsignedWrap;
+    SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
+    AddToWorklist(Add.getNode());
+    return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
+  }
+
+  // TODO: There is another possible fold here that was proven useful.
+  // It would be this:
+  //
+  // (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
+  //   * (ptradd x, y) has one use; and
+  //   * y is a constant; and
+  //   * z is not a constant.
+  //
+  // In some cases, specifically in AArch64's FEAT_CPA, it exposes the
+  // opportunity to select more complex instructions such as SUBPT and
+  // MSUBPT. However, a hypothetical corner case has been found that we could
+  // not avoid.
+  // Consider this (pseudo-POSIX C):
+  //
+  // char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
+  // char *p = mmap(LARGE_CONSTANT);
+  // char *q = foo(p, -LARGE_CONSTANT);
+  //
+  // Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
+  // further + z takes it back to the start of the mapping, so valid,
+  // regardless of the address mmap gave back. However, if mmap gives you an
+  // address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
+  // borrow from the high bits (with the subsequent + z carrying back into
+  // the high bits to give you a well-defined pointer) and thus trip
+  // FEAT_CPA's pointer corruption checks.
+  //
+  // We leave this fold as an opportunity for future work, addressing the
+  // corner case for FEAT_CPA, as well as reconciling the solution with the
+  // more general application of pointer arithmetic in other future targets.
+  // For now each architecture that wants this fold must implement it in the
+  // target-specific code (see e.g. SITargetLowering::performPtrAddCombine).
+
+  return SDValue();
+}
+
 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
 /// a shift and add with a different constant.
 static SDValue foldAddSubOfSignBit(SDNode *N, const SDLoc &DL,
@@ -14955,6 +15038,7 @@ SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
   default:
     break;
   case ISD::ADD:
+  case ISD::PTRADD:
   case ISD::SUB: {
     unsigned AlignShift = Log2(AL);
     SDValue LHS = N0.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6c736f1f13fda..0708c76bcb3fc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -938,6 +938,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
 
   setTargetDAGCombine({ISD::ADD,
+                       ISD::PTRADD,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
                        ISD::USUBO_CARRY,
@@ -14935,6 +14936,51 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
   return SDValue();
 }
 
+SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (N1.getOpcode() == ISD::ADD) {
+    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+    //    y is not, and (add y, z) is used only once.
+    // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+    //    z is not, and (add y, z) is used only once.
+    // The goal is to move constant offsets to the outermost ptradd, to create
+    // more opportunities to fold offsets into memory instructions.
+    // Together with the generic combines in DAGCombiner.cpp, this also
+    // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
+    //
+    // This transform is here instead of in the general DAGCombiner as it can
+    // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+    // AArch64's CPA.
+    SDValue X = N0;
+    SDValue Y = N1.getOperand(0);
+    SDValue Z = N1.getOperand(1);
+    bool N1OneUse = N1.hasOneUse();
+    bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+    bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+    if ((ZIsConstant != YIsConstant) && N1OneUse) {
+      SDNodeFlags Flags;
+      // If both additions in the original were NUW, the new ones are as well.
+      if (N->getFlags().hasNoUnsignedWrap() &&
+          N1->getFlags().hasNoUnsignedWrap())
+        Flags |= SDNodeFlags::NoUnsignedWrap;
+
+      if (YIsConstant)
+        std::swap(Y, Z);
+
+      SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+      DCI.AddToWorklist(Inner.getNode());
+      return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::performSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -15467,6 +15513,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   case ISD::ADD:
     return performAddCombine(N, DCI);
+  case ISD::PTRADD:
+    return performPtrAddCombine(N, DCI);
   case ISD::SUB:
     return performSubCombine(N, DCI);
   case ISD::UADDO_CARRY:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 5321f36fcba68..5368cf23be769 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -218,6 +218,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                               DAGCombinerInfo &DCI) const;
   SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performAddCarrySubCarryCombine(SDNode *N,
                                          DAGCombinerInfo &DCI) const;
   SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 4a5fa641da038..b78dea1684545 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -8,24 +8,14 @@
 ; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
 define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
-; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 24
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_LEGACY-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:24
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: global_load_ZTwoUses:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
   %l = load i64, ptr addrspace(1) %gep1, align 8
@@ -37,9 +27,8 @@ define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
 ; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
 ; GFX942_PTRADD:       ; %bb.0:
 ; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, 24
 ; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:24
 ; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -60,69 +49,36 @@ define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
 ; would be folded away in most cases, but the index computation introduced by
 ; the legalization of wide vector stores can for example introduce them.
 define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
-; GFX942_PTRADD-LABEL: store_v16i32:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    s_add_u32 s2, s0, 32
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s20
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s21
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s22
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s23
-; GFX942_PTRADD-NEXT:    s_addc_u32 s3, s1, 0
-; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX942_PTRADD-NEXT:    s_nop 1
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s16
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s17
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s18
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s19
-; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX942_PTRADD-NEXT:    s_nop 1
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s12
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s13
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s14
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s15
-; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942_PTRADD-NEXT:    s_nop 1
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s9
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: store_v16i32:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s20
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s21
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s22
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s23
-; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX942_LEGACY-NEXT:    s_nop 1
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s16
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s17
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s18
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s19
-; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX942_LEGACY-NEXT:    s_nop 1
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s12
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s13
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s14
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s15
-; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942_LEGACY-NEXT:    s_nop 1
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s9
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: store_v16i32:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, s20
+; GFX942-NEXT:    v_mov_b32_e32 v1, s21
+; GFX942-NEXT:    v_mov_b32_e32 v2, s22
+; GFX942-NEXT:    v_mov_b32_e32 v3, s23
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s16
+; GFX942-NEXT:    v_mov_b32_e32 v1, s17
+; GFX942-NEXT:    v_mov_b32_e32 v2, s18
+; GFX942-NEXT:    v_mov_b32_e32 v3, s19
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-NEXT:    v_mov_b32_e32 v2, s14
+; GFX942-NEXT:    v_mov_b32_e32 v3, s15
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s8
+; GFX942-NEXT:    v_mov_b32_e32 v1, s9
+; GFX942-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   store <16 x i32> %a, ptr addrspace(1) %out
   ret void
@@ -131,20 +87,12 @@ entry:
 
 ; Tests the (ptradd 0, x) -> x DAG combine.
 define void @baseptr_null(i64 %offset, i8 %v) {
-; GFX942_PTRADD-LABEL: baseptr_null:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], 0, 0, v[0:1]
-; GFX942_PTRADD-NEXT:    flat_store_byte v[0:1], v2
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: baseptr_null:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    flat_store_byte v[0:1], v2
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: baseptr_null:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_store_byte v[0:1], v2
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr null, i64 %offset
   store i8 %v, ptr %gep, align 1
   ret void
@@ -153,40 +101,21 @@ define void @baseptr_null(i64 %offset, i8 %v) {
 ; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
 ; assertalign DAG combine.
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
-; GFX942_PTRADD-LABEL: llvm_amdgcn_queue_ptr:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_add_u32 s8, s4, 8
-; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, 0
-; GFX942_PTRADD-NEXT:    s_addc_u32 s9, s5, 0
-; GFX942_PTRADD-NEXT:    global_load_ubyte v0, v2, s[2:3] sc0 sc1
-; GFX942_PTRADD-NEXT:    global_load_ubyte v0, v2, s[8:9] sc0 sc1
-; GFX942_PTRADD-NEXT:    global_load_ubyte v0, v2, s[0:1] sc0 sc1
-; GFX942_PTRADD-NEXT:    ; kill: killed $sgpr0_sgpr1
-; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942_PTRADD-NEXT:    ; kill: killed $sgpr8 killed $sgpr9
-; GFX942_PTRADD-NEXT:    ; kill: killed $sgpr2_sgpr3
-; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT:    s_endpgm
-;
-; GFX942_LEGACY-LABEL: llvm_amdgcn_queue_ptr:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, 0
-; GFX942_LEGACY-NEXT:    global_load_ubyte v0, v2, s[2:3] sc0 sc1
-; GFX942_LEGACY-NEXT:    global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
-; GFX942_LEGACY-NEXT:    global_load_ubyte v0, v2, s[0:1] sc0 sc1
-; GFX942_LEGACY-NEXT:    ; kill: killed $sgpr0_sgpr1
-; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942_LEGACY-NEXT:    ; kill: killed $sgpr2_sgpr3
-; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT:    s_endpgm
+; GFX942-LABEL: llvm_amdgcn_queue_ptr:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    global_load_ubyte v0, v2, s[2:3] sc0 sc1
+; GFX942-NEXT:    global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
+; GFX942-NEXT:    global_load_ubyte v0, v2, s[0:1] sc0 sc1
+; GFX942-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    ; kill: killed $sgpr2_sgpr3
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_endpgm
   %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
   %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -197,6 +126,3 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
   store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
   ret void
 }
-
-;; NOTE: These prefixes are unused and the list is autogenerated.
-;; Do not add tests below this line:
-; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
new file mode 100644
index 0000000000000..1934ce395e63d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for undef and poison DAG folds for the ISD::PTRADD SelectionDAG opcode.
+; If any additions are generated for these tests, the folds don't work.
+
+define ptr @poison_offset(ptr %p, i64 %offset) {
+; GFX942-LABEL: poison_offset:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %gep1 = getelementptr i8, ptr %p, i64 poison
+  ret ptr %gep1
+}
+
+define ptr @poison_base(ptr %p, i64 %offset) {
+; GFX942-LABEL: poison_base:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %gep1 = getelementptr i8, ptr poison, i64 %offset
+  ret ptr %gep1
+}
+
+define ptr @undef_offset(ptr %p, i64 %offset) {
+; GFX942-LABEL: undef_offset:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %gep1 = getelementptr i8, ptr %p, i64 undef
+  ret ptr %gep1
+}
+
+define ptr @undef_base(ptr %p, i64 %offset) {
+; GFX942-LABEL: undef_base:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %gep1 = getelementptr i8, ptr undef, i64 %offset
+  ret ptr %gep1
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX942_LEGACY: {{.*}}
+; GFX942_PTRADD: {{.*}}
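The generic combiner above deliberately leaves the constant-outward reassociation (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) to target-specific code, and this patch implements the related fold only for AMDGPU in SITargetLowering::performPtrAddCombine. The sketch below illustrates roughly what another target would add to opt in; it is an illustration only, MyTargetLowering is a placeholder class name, the sketch is not part of this patch, and only SelectionDAG APIs already used in the changes above are assumed.

// Sketch (not part of the patch): a hypothetical target opting into the
// reassociation that DAGCombiner::visitPTRADD leaves to target code.
// Registration in the target's constructor: setTargetDAGCombine(ISD::PTRADD);
// Dispatch in PerformDAGCombine: case ISD::PTRADD: return performPtrAddCombine(N, DCI);
SDValue MyTargetLowering::performPtrAddCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  SDValue N0 = N->getOperand(0); // pointer operand
  SDValue N1 = N->getOperand(1); // integer offset

  // (ptradd (ptradd x, c), z) -> (ptradd (ptradd x, z), c): move the constant
  // offset outward so a later combine can fold it into an addressing mode.
  // This can turn in-bounds pointer arithmetic out-of-bounds, which is why the
  // generic DAGCombiner does not do it. NUW flag propagation is omitted for
  // brevity, so the new nodes conservatively carry no flags.
  if (N0.getOpcode() == ISD::PTRADD && N0.hasOneUse() &&
      DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
    SDValue Inner = DAG.getMemBasePlusOffset(N0.getOperand(0), N1, DL);
    DCI.AddToWorklist(Inner.getNode());
    return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL);
  }
  return SDValue();
}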