Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit cc98b35

Browse files
authored
[AMDGPU] Masked load vectortype test (#129703)
1 parent 31845cf commit cc98b35
Copy full SHA for cc98b35

File tree

Expand file tree / Collapse file tree

1 file changed

+255
-0
lines changed
Filter options
Expand file tree / Collapse file tree

1 file changed

+255
-0
lines changed
+255 lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s
3+
4+
; Test: masked load of <2 x i32> through an inreg addrspace(1) pointer, with
; every mask lane being a copy of the single i1 argument %mask and an
; all-zero passthrough vector.
; The autogenerated assertions below expect v0/v1 to be zeroed, and one
; global_load_dwordx2 into v[0:1] to sit inside an s_and_saveexec_b64 /
; s_or_b64 exec region whose condition is (v0 & 1) == 1.
; NOTE(review): presumably v0 carries %mask and s[0:1] carries %ptr on entry;
; confirm against the AMDGPU calling convention.
define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
5+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v2i32:
6+
; GFX942: ; %bb.0: ; %entry
7+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
9+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
10+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
11+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
12+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
13+
; GFX942-NEXT: s_cbranch_execz .LBB0_2
14+
; GFX942-NEXT: ; %bb.1: ; %cond.load
15+
; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
16+
; GFX942-NEXT: .LBB0_2:
17+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
18+
; GFX942-NEXT: s_waitcnt vmcnt(0)
19+
; GFX942-NEXT: s_setpc_b64 s[30:31]
20+
entry:
21+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
%partialmaskvec = insertelement <2 x i1> poison, i1 %mask, i64 0
22+
%maskvec = shufflevector <2 x i1> %partialmaskvec, <2 x i1> poison, <2 x i32> zeroinitializer
23+
; Intrinsic operands: pointer, alignment, per-lane mask, passthrough.
; NOTE(review): the alignment operand is 2 here, while every other test in
; this file passes 4 - confirm the smaller alignment is intentional.
%result = tail call <2 x i32> @llvm.masked.load.v2i32.p1(ptr addrspace(1) %ptr, i32 2, <2 x i1> %maskvec, <2 x i32> zeroinitializer)
24+
ret <2 x i32> %result
25+
}
26+
27+
; Test: masked load of <4 x i32> through an inreg addrspace(1) pointer, with
; every mask lane being a copy of the single i1 argument %mask, alignment
; operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below expect v0..v3 to be zeroed, and one
; global_load_dwordx4 into v[0:3] to sit inside an s_and_saveexec_b64 /
; s_or_b64 exec region whose condition is (v0 & 1) == 1.
define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
28+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v4i32:
29+
; GFX942: ; %bb.0: ; %entry
30+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
32+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
33+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
34+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
35+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
36+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
37+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
38+
; GFX942-NEXT: s_cbranch_execz .LBB1_2
39+
; GFX942-NEXT: ; %bb.1: ; %cond.load
40+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
41+
; GFX942-NEXT: .LBB1_2:
42+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
43+
; GFX942-NEXT: s_waitcnt vmcnt(0)
44+
; GFX942-NEXT: s_setpc_b64 s[30:31]
45+
entry:
46+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
%partialmaskvec = insertelement <4 x i1> poison, i1 %mask, i64 0
47+
%maskvec = shufflevector <4 x i1> %partialmaskvec, <4 x i1> poison, <4 x i32> zeroinitializer
48+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <4 x i32> @llvm.masked.load.v4i32.p1(ptr addrspace(1) %ptr, i32 4, <4 x i1> %maskvec, <4 x i32> zeroinitializer)
49+
ret <4 x i32> %result
50+
}
51+
52+
; Test: masked load of <4 x float> through an inreg addrspace(1) pointer,
; with every mask lane being a copy of the single i1 argument %mask,
; alignment operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below list the same instruction sequence as
; the <4 x i32> test in this file: v0..v3 zeroed, then one
; global_load_dwordx4 into v[0:3] inside an exec-masked region.
define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
53+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v4f32:
54+
; GFX942: ; %bb.0: ; %entry
55+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
57+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
58+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
59+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
60+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
61+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
62+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
63+
; GFX942-NEXT: s_cbranch_execz .LBB2_2
64+
; GFX942-NEXT: ; %bb.1: ; %cond.load
65+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
66+
; GFX942-NEXT: .LBB2_2:
67+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
68+
; GFX942-NEXT: s_waitcnt vmcnt(0)
69+
; GFX942-NEXT: s_setpc_b64 s[30:31]
70+
entry:
71+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
%partialmaskvec = insertelement <4 x i1> poison, i1 %mask, i64 0
72+
%maskvec = shufflevector <4 x i1> %partialmaskvec, <4 x i1> poison, <4 x i32> zeroinitializer
73+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <4 x float> @llvm.masked.load.v4f32.p1(ptr addrspace(1) %ptr, i32 4, <4 x i1> %maskvec, <4 x float> zeroinitializer)
74+
ret <4 x float> %result
75+
}
76+
77+
; Test: masked load of <8 x i32> through an inreg addrspace(1) pointer, with
; every mask lane being a copy of the single i1 argument %mask, alignment
; operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below expect v0..v7 to be zeroed and the
; exec-masked region to contain two global_load_dwordx4 instructions:
; v[4:7] from offset 16 first, then v[0:3] from offset 0, with an s_nop 0
; between them.
define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
78+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8i32:
79+
; GFX942: ; %bb.0: ; %entry
80+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
82+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
83+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
84+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
85+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
86+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
87+
; GFX942-NEXT: v_mov_b32_e32 v4, v0
88+
; GFX942-NEXT: v_mov_b32_e32 v5, v0
89+
; GFX942-NEXT: v_mov_b32_e32 v6, v0
90+
; GFX942-NEXT: v_mov_b32_e32 v7, v0
91+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
92+
; GFX942-NEXT: s_cbranch_execz .LBB3_2
93+
; GFX942-NEXT: ; %bb.1: ; %cond.load
94+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
95+
; GFX942-NEXT: s_nop 0
96+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
97+
; GFX942-NEXT: .LBB3_2:
98+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
99+
; GFX942-NEXT: s_waitcnt vmcnt(0)
100+
; GFX942-NEXT: s_setpc_b64 s[30:31]
101+
entry:
102+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i64 0
103+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
104+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x i32> zeroinitializer)
105+
ret <8 x i32> %result
106+
}
107+
108+
; Test: masked load of <8 x float> through an inreg addrspace(1) pointer,
; with every mask lane being a copy of the single i1 argument %mask,
; alignment operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below list the same instruction sequence as
; the <8 x i32> test in this file: v0..v7 zeroed, then two
; global_load_dwordx4 instructions (v[4:7] at offset 16, v[0:3] at offset 0)
; inside an exec-masked region.
define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
109+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8f32:
110+
; GFX942: ; %bb.0: ; %entry
111+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
113+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
114+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
115+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
116+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
117+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
118+
; GFX942-NEXT: v_mov_b32_e32 v4, v0
119+
; GFX942-NEXT: v_mov_b32_e32 v5, v0
120+
; GFX942-NEXT: v_mov_b32_e32 v6, v0
121+
; GFX942-NEXT: v_mov_b32_e32 v7, v0
122+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
123+
; GFX942-NEXT: s_cbranch_execz .LBB4_2
124+
; GFX942-NEXT: ; %bb.1: ; %cond.load
125+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
126+
; GFX942-NEXT: s_nop 0
127+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
128+
; GFX942-NEXT: .LBB4_2:
129+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
130+
; GFX942-NEXT: s_waitcnt vmcnt(0)
131+
; GFX942-NEXT: s_setpc_b64 s[30:31]
132+
entry:
133+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i64 0
134+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
135+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <8 x float> @llvm.masked.load.v8f32.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x float> zeroinitializer)
136+
ret <8 x float> %result
137+
}
138+
139+
; Test: masked load of <8 x i16> through an inreg addrspace(1) pointer, with
; every mask lane being a copy of the single i1 argument %mask, alignment
; operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below expect v0..v3 to be zeroed, and one
; global_load_dwordx4 into v[0:3] to sit inside an s_and_saveexec_b64 /
; s_or_b64 exec region whose condition is (v0 & 1) == 1.
define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
140+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8i16:
141+
; GFX942: ; %bb.0: ; %entry
142+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
144+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
145+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
146+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
147+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
148+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
149+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
150+
; GFX942-NEXT: s_cbranch_execz .LBB5_2
151+
; GFX942-NEXT: ; %bb.1: ; %cond.load
152+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
153+
; GFX942-NEXT: .LBB5_2:
154+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
155+
; GFX942-NEXT: s_waitcnt vmcnt(0)
156+
; GFX942-NEXT: s_setpc_b64 s[30:31]
157+
entry:
158+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
; NOTE(review): the insertelement index is typed i16 here, whereas the
; v2i32 through v8f32 tests in this file use i64 - presumably cosmetic;
; confirm.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i16 0
159+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
160+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <8 x i16> @llvm.masked.load.v8i16.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x i16> zeroinitializer)
161+
ret <8 x i16> %result
162+
}
163+
164+
; Test: masked load of <8 x half> through an inreg addrspace(1) pointer, with
; every mask lane being a copy of the single i1 argument %mask, alignment
; operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below expect v0..v3 to be zeroed, and one
; global_load_dwordx4 into v[0:3] to sit inside an s_and_saveexec_b64 /
; s_or_b64 exec region whose condition is (v0 & 1) == 1.
define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
165+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8f16:
166+
; GFX942: ; %bb.0: ; %entry
167+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
169+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
170+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
171+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
172+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
173+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
174+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
175+
; GFX942-NEXT: s_cbranch_execz .LBB6_2
176+
; GFX942-NEXT: ; %bb.1: ; %cond.load
177+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
178+
; GFX942-NEXT: .LBB6_2:
179+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
180+
; GFX942-NEXT: s_waitcnt vmcnt(0)
181+
; GFX942-NEXT: s_setpc_b64 s[30:31]
182+
entry:
183+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
; NOTE(review): the insertelement index is typed i16 here, whereas the
; v2i32 through v8f32 tests in this file use i64 - presumably cosmetic;
; confirm.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i16 0
184+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
185+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <8 x half> @llvm.masked.load.v8f16.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x half> zeroinitializer)
186+
ret <8 x half> %result
187+
}
188+
189+
; Test: masked load of <8 x bfloat> through an inreg addrspace(1) pointer,
; with every mask lane being a copy of the single i1 argument %mask,
; alignment operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below expect v0..v3 to be zeroed, and one
; global_load_dwordx4 into v[0:3] to sit inside an s_and_saveexec_b64 /
; s_or_b64 exec region whose condition is (v0 & 1) == 1.
define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
190+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8bf16:
191+
; GFX942: ; %bb.0: ; %entry
192+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
194+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
195+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
196+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
197+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
198+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
199+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
200+
; GFX942-NEXT: s_cbranch_execz .LBB7_2
201+
; GFX942-NEXT: ; %bb.1: ; %cond.load
202+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
203+
; GFX942-NEXT: .LBB7_2:
204+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
205+
; GFX942-NEXT: s_waitcnt vmcnt(0)
206+
; GFX942-NEXT: s_setpc_b64 s[30:31]
207+
entry:
208+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
; NOTE(review): the insertelement index is typed i32 here, whereas the
; v2i32 through v8f32 tests in this file use i64 - presumably cosmetic;
; confirm.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i32 0
209+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
210+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <8 x bfloat> @llvm.masked.load.v8bf16.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x bfloat> zeroinitializer)
211+
ret <8 x bfloat> %result
212+
}
213+
214+
; Test: masked load of <16 x i8> through an inreg addrspace(1) pointer, with
; every mask lane being a copy of the single i1 argument %mask, alignment
; operand 4 and an all-zero passthrough vector.
; The autogenerated assertions below expect v16..v19 to be zeroed and one
; global_load_dwordx4 into v[16:19] to sit inside an s_and_saveexec_b64 /
; s_or_b64 exec region. After the region, the four dwords are spread over
; v0 through v15 using v_lshrrev shifts (by 8, 16 and 24) and v_mov copies.
; NOTE(review): presumably this places one i8 element per VGPR for the
; return value; confirm against the AMDGPU calling convention.
define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
215+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v16i8:
216+
; GFX942: ; %bb.0: ; %entry
217+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
219+
; GFX942-NEXT: v_mov_b32_e32 v16, 0
220+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
221+
; GFX942-NEXT: v_mov_b32_e32 v17, v16
222+
; GFX942-NEXT: v_mov_b32_e32 v18, v16
223+
; GFX942-NEXT: v_mov_b32_e32 v19, v16
224+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
225+
; GFX942-NEXT: s_cbranch_execz .LBB8_2
226+
; GFX942-NEXT: ; %bb.1: ; %cond.load
227+
; GFX942-NEXT: global_load_dwordx4 v[16:19], v16, s[0:1]
228+
; GFX942-NEXT: .LBB8_2:
229+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
230+
; GFX942-NEXT: s_waitcnt vmcnt(0)
231+
; GFX942-NEXT: v_lshrrev_b64 v[20:21], 24, v[16:17]
232+
; GFX942-NEXT: v_lshrrev_b64 v[22:23], 24, v[18:19]
233+
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 8, v16
234+
; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v16
235+
; GFX942-NEXT: v_lshrrev_b32_e32 v5, 8, v17
236+
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v17
237+
; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v17
238+
; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v18
239+
; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v18
240+
; GFX942-NEXT: v_lshrrev_b32_e32 v13, 8, v19
241+
; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v19
242+
; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v19
243+
; GFX942-NEXT: v_mov_b32_e32 v0, v16
244+
; GFX942-NEXT: v_mov_b32_e32 v3, v20
245+
; GFX942-NEXT: v_mov_b32_e32 v4, v17
246+
; GFX942-NEXT: v_mov_b32_e32 v8, v18
247+
; GFX942-NEXT: v_mov_b32_e32 v11, v22
248+
; GFX942-NEXT: v_mov_b32_e32 v12, v19
249+
; GFX942-NEXT: s_setpc_b64 s[30:31]
250+
entry:
251+
; Build the splat mask: place %mask in lane 0, then broadcast lane 0 to all
; lanes with an all-zero shuffle mask.
; NOTE(review): the insertelement index is typed i32 here, whereas the
; v2i32 through v8f32 tests in this file use i64 - presumably cosmetic;
; confirm.
%partialmaskvec = insertelement <16 x i1> poison, i1 %mask, i32 0
252+
%maskvec = shufflevector <16 x i1> %partialmaskvec, <16 x i1> poison, <16 x i32> zeroinitializer
253+
; Intrinsic operands: pointer, alignment (4), per-lane mask, passthrough.
%result = tail call <16 x i8> @llvm.masked.load.v16i8.p1(ptr addrspace(1) %ptr, i32 4, <16 x i1> %maskvec, <16 x i8> zeroinitializer)
254+
ret <16 x i8> %result
255+
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.