Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 1377535

Browse files
frederik-h and arsenm authored
[AMDGPU] si-peephole-sdwa: Fix cndmask vcc use for wave32 (#139541)
Before V_CNDMASK_B32_e64 is converted to SDWA form, it is first converted to V_CNDMASK_B32_e32. For wave32, the implicit vcc use of that instruction must be rewritten to a vcc_lo use. Previously this rewrite happened only after the final conversion to SDWA form, which led to a compiler error in situations where the SDWA conversion was aborted. Make sure that the vcc fix is applied even if the SDWA conversion is not completed. --------- Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
1 parent 0b490f1 commit 1377535
Copy full SHA for 1377535

File tree

Expand file treeCollapse file tree

3 files changed

+155
-0
lines changed
Filter options
Expand file treeCollapse file tree

3 files changed

+155
-0
lines changed

‎llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Copy file name to clipboardExpand all lines: llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,6 +1105,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
11051105
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
11061106
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
11071107
.setMIFlags(MI.getFlags());
1108+
TII->fixImplicitOperands(*Converted);
11081109
LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
11091110
(void)Converted;
11101111
MI.eraseFromParent();
+65Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s
3+
4+
; In this test, V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32,
5+
; but the expected conversion to SDWA does not occur. This led to a
6+
; compilation error, because the use of $vcc in the resulting
7+
; instruction must be fixed to $vcc_lo for wave32 which only happened
8+
; after the full conversion to SDWA.
9+
10+
define void @quux(i32 %arg, i1 %arg1, i1 %arg2) {
11+
; CHECK-LABEL: quux:
12+
; CHECK: ; %bb.0: ; %bb
13+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14+
; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
15+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1
16+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
17+
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
18+
; CHECK-NEXT: s_cbranch_execz .LBB0_2
19+
; CHECK-NEXT: ; %bb.1: ; %bb3
20+
; CHECK-NEXT: v_and_b32_e32 v1, 0x3ff, v31
21+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
22+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
23+
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
24+
; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:3
25+
; CHECK-NEXT: v_mov_b32_e32 v2, 0xffff
26+
; CHECK-NEXT: s_waitcnt vmcnt(0)
27+
; CHECK-NEXT: v_and_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
28+
; CHECK-NEXT: v_mov_b32_e32 v1, 24
29+
; CHECK-NEXT: v_mov_b32_e32 v2, 0xff
30+
; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
31+
; CHECK-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
32+
; CHECK-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
33+
; CHECK-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
34+
; CHECK-NEXT: .LBB0_2: ; %bb9
35+
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
36+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
37+
; CHECK-NEXT: v_mov_b32_e32 v3, 0
38+
; CHECK-NEXT: global_store_byte v[2:3], v1, off
39+
; CHECK-NEXT: s_setpc_b64 s[30:31]
40+
bb:
41+
br i1 %arg1, label %bb9, label %bb3
42+
43+
bb3: ; preds = %bb
44+
%call = tail call i32 @llvm.amdgcn.workitem.id.x()
45+
%mul = mul i32 %call, 5
46+
%zext = zext i32 %mul to i64
47+
%getelementptr = getelementptr i8, ptr addrspace(1) null, i64 %zext
48+
%getelementptr4 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 4
49+
%load = load i8, ptr addrspace(1) %getelementptr4, align 1
50+
%getelementptr5 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 3
51+
%load6 = load i8, ptr addrspace(1) %getelementptr5, align 1
52+
%insertelement = insertelement <5 x i8> poison, i8 %load, i64 4
53+
%select = select i1 %arg2, <5 x i8> %insertelement, <5 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0>
54+
%insertelement7 = insertelement <5 x i8> %select, i8 %load6, i64 0
55+
%icmp = icmp ult i32 0, %arg
56+
%select8 = select i1 %icmp, <5 x i8> zeroinitializer, <5 x i8> %insertelement7
57+
%shufflevector = shufflevector <5 x i8> zeroinitializer, <5 x i8> %select8, <5 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9>
58+
br label %bb9
59+
60+
bb9: ; preds = %bb3, %bb
61+
%phi = phi <5 x i8> [ %shufflevector, %bb3 ], [ zeroinitializer, %bb ]
62+
%extractelement = extractelement <5 x i8> %phi, i64 0
63+
store i8 %extractelement, ptr addrspace(1) null, align 1
64+
ret void
65+
}

‎llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir

Copy file name to clipboardExpand all lines: llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir
+89Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,3 +230,92 @@ body: |
230230
$vgpr0 = COPY %3
231231
SI_RETURN implicit $vgpr0
232232
...
233+
234+
---
235+
name: cndmask-not-converted
236+
tracksRegLiveness: true
237+
body: |
238+
; CHECK-LABEL: name: cndmask-not-converted
239+
; CHECK: bb.0:
240+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
241+
; CHECK-NEXT: liveins: $vgpr0, $sgpr8_sgpr9
242+
; CHECK-NEXT: {{ $}}
243+
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
244+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
245+
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0
246+
; CHECK-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 0, implicit-def $scc
247+
; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
248+
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
249+
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_CSELECT_B32_]], implicit-def dead $scc
250+
; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_]]
251+
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc_lo
252+
; CHECK-NEXT: S_BRANCH %bb.1
253+
; CHECK-NEXT: {{ $}}
254+
; CHECK-NEXT: bb.1:
255+
; CHECK-NEXT: successors: %bb.2(0x80000000)
256+
; CHECK-NEXT: {{ $}}
257+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
258+
; CHECK-NEXT: [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[COPY1]](s32), 5, 0, implicit $exec
259+
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
260+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_U32_U24_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1
261+
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE]], 3, 0, implicit $exec
262+
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_USHORT]], 255, implicit $exec
263+
; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
264+
; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_MOV_B32_e32_2]], 0, [[GLOBAL_LOAD_USHORT]], 0, 6, 0, 6, 0, implicit $exec
265+
; CHECK-NEXT: S_CMP_EQ_U32 [[COPY2]].sub0, 0, implicit-def $scc
266+
; CHECK-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
267+
; CHECK-NEXT: $vcc_lo = COPY [[S_CSELECT_B32_1]]
268+
; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_AND_B32_sdwa]], implicit $vcc_lo, implicit $exec
269+
; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 24, implicit $exec
270+
; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_MOV_B32_e32_3]], 0, [[V_CNDMASK_B32_e32_]], 0, 1, 0, 6, 6, implicit $exec
271+
; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CNDMASK_B32_e32_]], implicit $exec
272+
; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec
273+
; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_CNDMASK_B32_e32_]], 0, [[V_MOV_B32_e32_4]], 0, 6, 0, 5, 6, implicit $exec
274+
; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa1]], 0, [[V_LSHRREV_B32_sdwa]], 0, 5, 0, 6, 6, implicit $exec
275+
; CHECK-NEXT: {{ $}}
276+
; CHECK-NEXT: bb.2:
277+
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_OR_B32_sdwa]], %bb.1
278+
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
279+
; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], [[PHI]], 0, 0, implicit $exec
280+
; CHECK-NEXT: S_ENDPGM 0
281+
bb.0:
282+
successors: %bb.1(0x40000000), %bb.2(0x40000000)
283+
liveins: $vgpr0, $sgpr8_sgpr9
284+
285+
%0:sgpr_64 = COPY $sgpr8_sgpr9
286+
%1:vgpr_32 = COPY $vgpr0
287+
%2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0
288+
S_BITCMP1_B32 %2.sub1, 0, implicit-def $scc
289+
%3:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
290+
%4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
291+
%5:sreg_32 = S_AND_B32 $exec_lo, %3, implicit-def dead $scc
292+
$vcc_lo = COPY %5
293+
S_CBRANCH_VCCNZ %bb.2, implicit $vcc
294+
S_BRANCH %bb.1
295+
296+
bb.1:
297+
successors: %bb.2(0x80000000)
298+
299+
%6:sreg_64 = COPY %2
300+
%7:vgpr_32 = V_MUL_U32_U24_e64 %1(s32), 5, 0, implicit $exec
301+
%8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
302+
%9:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, killed %8, %subreg.sub1
303+
%10:vgpr_32 = GLOBAL_LOAD_USHORT %9, 3, 0, implicit $exec
304+
%11:vgpr_32 = V_AND_B32_e64 %10, 255, implicit $exec
305+
%12:vgpr_32 = V_AND_B32_e64 65535, killed %11, implicit $exec
306+
S_CMP_EQ_U32 %6.sub0, 0, implicit-def $scc
307+
%13:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
308+
%14:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %12, %13, implicit $exec
309+
%15:vgpr_32 = V_LSHRREV_B32_e64 24, %14, implicit $exec
310+
%16:vgpr_32 = V_LSHLREV_B16_e64 8, %15, implicit $exec
311+
%17:vgpr_32 = V_LSHRREV_B32_e64 16, %14, implicit $exec
312+
%18:vgpr_32 = V_AND_B32_e64 %17, 255, implicit $exec
313+
%19:vgpr_32 = V_OR_B32_e64 killed %18, killed %16, implicit $exec
314+
%20:vgpr_32 = V_LSHLREV_B32_e64 16, killed %19, implicit $exec
315+
316+
bb.2:
317+
%21:vgpr_32 = PHI %4, %bb.0, %20, %bb.1
318+
%22:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
319+
GLOBAL_STORE_BYTE killed %22, %21, 0, 0, implicit $exec
320+
S_ENDPGM 0
321+
...

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.