From d32f06051c3538c6bca47cf31d818626f66faa81 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 12 May 2025 05:39:17 -0400 Subject: [PATCH 01/10] [AMDGPU] Add a test that demonstrates si-peephole-sdwa failure on V_CNDMASK --- .../AMDGPU/sdwa-peephole-cndmask-fail.ll | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll new file mode 100644 index 0000000000000..1f7706b8f16c3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -0,0 +1,51 @@ +; RUN: llc %s -march=amdgcn -mcpu=gfx1030 -o - 2>&1 | FileCheck %s +; XFAIL: * + +; V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, but the +; expected conversion to SDWA does not occur. FIXME This leads to a +; compilation error, because the use of $vcc in the resulting +; instruction must be fixed to $vcc_lo for wave32. This only happens +; after the full conversion to SDWA. 
+ + +; CHECK-NOT: {{.*}}V_CNDMASK_B32_e32{{.*}}$vcc +; CHECK-NOT: {{.*}}Bad machine code: Virtual register defs don't dominate all uses + +; ModuleID = 'test.ll' +source_filename = "test.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +define amdgpu_kernel void @quux(i32 %arg, i1 %arg1, i1 %arg2) #0 { +bb: + br i1 %arg1, label %bb9, label %bb3 + +bb3: ; preds = %bb + %call = tail call i32 @llvm.amdgcn.workitem.id.x() + %mul = mul i32 %call, 5 + %zext = zext i32 %mul to i64 + %getelementptr = getelementptr i8, ptr addrspace(1) null, i64 %zext + %getelementptr4 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 4 + %load = load i8, ptr addrspace(1) %getelementptr4, align 1 + %getelementptr5 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 3 + %load6 = load i8, ptr addrspace(1) %getelementptr5, align 1 + %insertelement = insertelement <5 x i8> poison, i8 %load, i64 4 + %select = select i1 %arg2, <5 x i8> %insertelement, <5 x i8> + %insertelement7 = insertelement <5 x i8> %select, i8 %load6, i64 0 + %icmp = icmp ult i32 0, %arg + %select8 = select i1 %icmp, <5 x i8> zeroinitializer, <5 x i8> %insertelement7 + %shufflevector = shufflevector <5 x i8> zeroinitializer, <5 x i8> %select8, <5 x i32> + br label %bb9 + +bb9: ; preds = %bb3, %bb + %phi = phi <5 x i8> [ %shufflevector, %bb3 ], [ zeroinitializer, %bb ] + %extractelement = extractelement <5 x i8> %phi, i64 0 + store i8 %extractelement, ptr addrspace(1) null, align 1 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { "target-cpu"="gfx1030" } +attributes #1 = { nocallback nofree nosync nounwind speculatable 
willreturn memory(none) "target-cpu"="gfx1030" } From 85e8efa7964ba13a4de88d17c1230186a055a5b0 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 12 May 2025 07:16:09 -0400 Subject: [PATCH 02/10] [AMDGPU] si-peephole-sdwa: Fix cndmask vcc use for wave32 --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 1 + llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 8eb1d7253cd48..bd8baaaa3df20 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1105,6 +1105,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)) .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)) .setMIFlags(MI.getFlags()); + TII->fixImplicitOperands(*Converted); LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted); (void)Converted; MI.eraseFromParent(); diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll index 1f7706b8f16c3..9ab5a31b52441 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -1,15 +1,15 @@ ; RUN: llc %s -march=amdgcn -mcpu=gfx1030 -o - 2>&1 | FileCheck %s -; XFAIL: * -; V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, but the -; expected conversion to SDWA does not occur. FIXME This leads to a +; In this test, V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, +; but the expected conversion to SDWA does not occur. This led to a ; compilation error, because the use of $vcc in the resulting -; instruction must be fixed to $vcc_lo for wave32. This only happens +; instruction must be fixed to $vcc_lo for wave32 which only happened ; after the full conversion to SDWA. 
; CHECK-NOT: {{.*}}V_CNDMASK_B32_e32{{.*}}$vcc ; CHECK-NOT: {{.*}}Bad machine code: Virtual register defs don't dominate all uses +; CHECK: {{.*}}v_cndmask_b32_e32{{.*}}vcc_lo ; ModuleID = 'test.ll' source_filename = "test.ll" From dc740cea3c32fd9cfe99301f3535195beff04b76 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 12 May 2025 08:20:43 -0400 Subject: [PATCH 03/10] Clean up test --- .../CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll index 9ab5a31b52441..c3f1f1cf7950c 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -11,12 +11,7 @@ ; CHECK-NOT: {{.*}}Bad machine code: Virtual register defs don't dominate all uses ; CHECK: {{.*}}v_cndmask_b32_e32{{.*}}vcc_lo -; ModuleID = 'test.ll' -source_filename = "test.ll" -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" -target triple = "amdgcn-amd-amdhsa" - -define amdgpu_kernel void @quux(i32 %arg, i1 %arg1, i1 %arg2) #0 { +define amdgpu_kernel void @quux(i32 %arg, i1 %arg1, i1 %arg2) { bb: br i1 %arg1, label %bb9, label %bb3 @@ -42,10 +37,4 @@ bb9: ; preds = %bb3, %bb %extractelement = extractelement <5 x i8> %phi, i64 0 store i8 %extractelement, ptr addrspace(1) null, align 1 ret void -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 - -attributes #0 = { "target-cpu"="gfx1030" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1030" } +} \ No newline at end of file From 
6f2426344000352914625af58caa24f650a5b643 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 12 May 2025 14:46:51 +0200 Subject: [PATCH 04/10] Update llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll Co-authored-by: Matt Arsenault --- llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll index c3f1f1cf7950c..e95778a1e1759 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -37,4 +37,4 @@ bb9: ; preds = %bb3, %bb %extractelement = extractelement <5 x i8> %phi, i64 0 store i8 %extractelement, ptr addrspace(1) null, align 1 ret void -} \ No newline at end of file +} From 4b52ec535329ab21d10660d5e78ae1bfc75afbcd Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 12 May 2025 11:07:33 -0400 Subject: [PATCH 05/10] Add mir test --- .../AMDGPU/sdwa-peephole-cndmask-wave32.mir | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir index 4b45c54a3b83d..34a2c8735a7cb 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir @@ -230,3 +230,92 @@ body: | $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 ... 
+ +--- +name: cndmask-not-converted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cndmask-not-converted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr8_sgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0 + ; CHECK-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 0, implicit-def $scc + ; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_CSELECT_B32_]], implicit-def dead $scc + ; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_]] + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc_lo + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[COPY1]](s32), 5, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_U32_U24_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE]], 3, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_USHORT]], 255, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_MOV_B32_e32_2]], 0, [[GLOBAL_LOAD_USHORT]], 0, 6, 0, 6, 0, implicit $exec + ; 
CHECK-NEXT: S_CMP_EQ_U32 [[COPY2]].sub0, 0, implicit-def $scc + ; CHECK-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; CHECK-NEXT: $vcc_lo = COPY [[S_CSELECT_B32_1]] + ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_AND_B32_sdwa]], implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 24, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_MOV_B32_e32_3]], 0, [[V_CNDMASK_B32_e32_]], 0, 1, 0, 6, 6, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CNDMASK_B32_e32_]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_CNDMASK_B32_e32_]], 0, [[V_MOV_B32_e32_4]], 0, 6, 0, 5, 6, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa1]], 0, [[V_LSHRREV_B32_sdwa]], 0, 5, 0, 6, 6, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_OR_B32_sdwa]], %bb.1 + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], [[PHI]], 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $sgpr8_sgpr9 + + %0:sgpr_64(p4) = COPY $sgpr8_sgpr9 + %1:vgpr_32(s32) = COPY $vgpr0 + %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0 + S_BITCMP1_B32 %2.sub1, 0, implicit-def $scc + %3:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_32 = S_AND_B32 $exec_lo, %3, implicit-def dead $scc + $vcc_lo = COPY %5 + S_CBRANCH_VCCNZ %bb.2, implicit $vcc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %6:sreg_64 = 
COPY %2 + %7:vgpr_32 = V_MUL_U32_U24_e64 %1(s32), 5, 0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, killed %8, %subreg.sub1 + %10:vgpr_32 = GLOBAL_LOAD_USHORT %9, 3, 0, implicit $exec + %11:vgpr_32 = V_AND_B32_e64 %10, 255, implicit $exec + %12:vgpr_32 = V_AND_B32_e64 65535, killed %11, implicit $exec + S_CMP_EQ_U32 %6.sub0, 0, implicit-def $scc + %13:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %14:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %12, %13, implicit $exec + %15:vgpr_32 = V_LSHRREV_B32_e64 24, %14, implicit $exec + %16:vgpr_32 = V_LSHLREV_B16_e64 8, %15, implicit $exec + %17:vgpr_32 = V_LSHRREV_B32_e64 16, %14, implicit $exec + %18:vgpr_32 = V_AND_B32_e64 %17, 255, implicit $exec + %19:vgpr_32 = V_OR_B32_e64 killed %18, killed %16, implicit $exec + %20:vgpr_32 = V_LSHLREV_B32_e64 16, killed %19, implicit $exec + + bb.2: + %21:vgpr_32 = PHI %4, %bb.0, %20, %bb.1 + %22:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + GLOBAL_STORE_BYTE killed %22, %21, 0, 0, implicit $exec + S_ENDPGM 0 +... 
From b5f3df9df196507cdea8c2e22f817e6138cf0669 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Tue, 13 May 2025 09:15:36 +0200 Subject: [PATCH 06/10] Update llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir Co-authored-by: Matt Arsenault --- llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir index 34a2c8735a7cb..1db3f1eef5667 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir @@ -282,9 +282,9 @@ body: | successors: %bb.1(0x40000000), %bb.2(0x40000000) liveins: $vgpr0, $sgpr8_sgpr9 - %0:sgpr_64(p4) = COPY $sgpr8_sgpr9 - %1:vgpr_32(s32) = COPY $vgpr0 - %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0 + %0:sgpr_64 = COPY $sgpr8_sgpr9 + %1:vgpr_32 = COPY $vgpr0 + %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 S_BITCMP1_B32 %2.sub1, 0, implicit-def $scc %3:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec From d68738f955c802bbef61c85899dabc81b3c4aeb1 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Tue, 13 May 2025 09:16:01 +0200 Subject: [PATCH 07/10] Update llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll Co-authored-by: Matt Arsenault --- llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll index e95778a1e1759..04b83e95ba595 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -1,4 +1,4 @@ -; RUN: llc %s -march=amdgcn -mcpu=gfx1030 -o - 2>&1 | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s ; In this test, V_CNDMASK_B32_e64 gets converted 
to V_CNDMASK_B32_e32, ; but the expected conversion to SDWA does not occur. This led to a From 5b1a15804be0221b4937b6938a3f4c74c228a048 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Tue, 13 May 2025 08:21:10 +0000 Subject: [PATCH 08/10] Simplify and correct ll test - Must use -mtriple to reproduce the bug on the unfixed branch - Function does not need to be a kernel --- llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll index 04b83e95ba595..a1db6a6a308dd 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s ; In this test, V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, ; but the expected conversion to SDWA does not occur. This led to a @@ -6,12 +6,7 @@ ; instruction must be fixed to $vcc_lo for wave32 which only happened ; after the full conversion to SDWA. 
- -; CHECK-NOT: {{.*}}V_CNDMASK_B32_e32{{.*}}$vcc -; CHECK-NOT: {{.*}}Bad machine code: Virtual register defs don't dominate all uses -; CHECK: {{.*}}v_cndmask_b32_e32{{.*}}vcc_lo - -define amdgpu_kernel void @quux(i32 %arg, i1 %arg1, i1 %arg2) { +define void @quux(i32 %arg, i1 %arg1, i1 %arg2) { bb: br i1 %arg1, label %bb9, label %bb3 From 3a1e535911c8dd9d9857d5a6c1d593b593f02a44 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Tue, 13 May 2025 10:01:26 +0000 Subject: [PATCH 09/10] Update test expectations --- .../AMDGPU/sdwa-peephole-cndmask-fail.ll | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll index a1db6a6a308dd..1c2d07c2f7af5 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s ; In this test, V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, @@ -7,6 +8,35 @@ ; after the full conversion to SDWA. 
define void @quux(i32 %arg, i1 %arg1, i1 %arg2) { +; CHECK-LABEL: quux: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %bb3 +; CHECK-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1 +; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:3 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xffff +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_and_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; CHECK-NEXT: v_mov_b32_e32 v1, 24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xff +; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; CHECK-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CHECK-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; CHECK-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CHECK-NEXT: .LBB0_2: ; %bb9 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: global_store_byte v[2:3], v1, off +; CHECK-NEXT: s_setpc_b64 s[30:31] bb: br i1 %arg1, label %bb9, label %bb3 From 245792d3980b95e1f547c01c3b7539648acbd439 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Tue, 13 May 2025 12:55:18 +0000 Subject: [PATCH 10/10] Update mir test --- llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir index 1db3f1eef5667..aef392749498a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir @@ -240,9 +240,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vgpr0, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 ; CHECK-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 0, implicit-def $scc ; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec