Source LLVM IR
Created May 6, 2025 21:19
PR #137930 Triage
.amdgcn_target "amdgcn-amd-amdhsa--gfx942" | |
.amdhsa_code_object_version 5 | |
.text | |
.globl matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.p2align 8 | |
.type matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32,@function | |
matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32: | |
s_load_dwordx2 s[2:3], s[0:1], 0x0 | |
s_load_dwordx8 s[4:11], s[0:1], 0x8 | |
s_load_dwordx4 s[12:15], s[0:1], 0x28 | |
s_waitcnt lgkmcnt(0) | |
s_branch .LBB0_0 | |
.p2align 8 | |
.LBB0_0: | |
s_mov_b64 s[28:29], s[2:3] | |
s_mul_i32 s2, s10, s9 | |
s_mul_hi_u32 s3, s10, s8 | |
s_add_i32 s2, s3, s2 | |
s_mul_i32 s3, s11, s8 | |
s_add_i32 s3, s2, s3 | |
s_mul_i32 s2, s10, s8 | |
v_mov_b64_e32 v[6:7], s[10:11] | |
v_cmp_gt_u64_e32 vcc, s[2:3], v[6:7] | |
s_mov_b64 s[20:21], s[6:7] | |
s_and_b64 s[6:7], vcc, exec | |
s_load_dwordx4 s[24:27], s[0:1], 0x38 | |
s_cselect_b32 s30, s2, s10 | |
s_load_dwordx2 s[0:1], s[0:1], 0x38 | |
s_mul_i32 s2, s14, s13 | |
s_mul_hi_u32 s3, s14, s12 | |
v_mov_b32_e32 v3, s15 | |
s_add_i32 s2, s3, s2 | |
s_mul_i32 s15, s15, s12 | |
v_mov_b32_e32 v2, s14 | |
s_add_i32 s3, s2, s15 | |
s_mul_i32 s2, s14, s12 | |
v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] | |
s_and_b64 s[8:9], vcc, exec | |
s_cselect_b32 s6, s2, s14 | |
s_waitcnt lgkmcnt(0) | |
s_mul_i32 s1, s26, s1 | |
s_mul_hi_u32 s2, s26, s0 | |
s_add_i32 s1, s2, s1 | |
s_mul_i32 s2, s27, s0 | |
v_mov_b32_e32 v4, s26 | |
v_mov_b32_e32 v5, s27 | |
s_add_i32 s1, s1, s2 | |
s_mul_i32 s0, s26, s0 | |
v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] | |
s_and_b64 s[2:3], vcc, exec | |
s_cselect_b32 s0, s0, s26 | |
s_lshl_b32 s22, s0, 2 | |
s_and_b32 s21, s21, 0xffff | |
s_mov_b32 s7, 0x27000 | |
v_lshl_add_u32 v1, v0, 2, 0 | |
v_mov_b32_e32 v2, 0 | |
s_cmp_eq_u64 s[10:11], 0 | |
s_mov_b64 s[0:1], 0 | |
ds_write_b32 v1, v2 | |
s_cbranch_scc1 .LBB0_10 | |
s_and_b32 s29, s29, 0xffff | |
s_and_b32 s5, s5, 0xffff | |
s_mul_i32 s2, s17, s10 | |
s_lshl_b32 s12, s14, 6 | |
v_add_u32_e32 v4, s2, v0 | |
v_mul_lo_u32 v2, v0, s14 | |
s_add_u32 s2, 0, s10 | |
v_add_u32_e32 v5, s16, v2 | |
s_addc_u32 s3, s11, 0 | |
v_mov_b64_e32 v[2:3], s[10:11] | |
s_mov_b32 s31, s7 | |
s_branch .LBB0_3 | |
.LBB0_2: | |
s_or_b64 exec, exec, s[8:9] | |
s_add_u32 s0, s0, 64 | |
s_addc_u32 s1, s1, 0 | |
s_add_u32 s2, s2, 0xffffffc0 | |
v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] | |
v_add_u32_e32 v4, 64, v4 | |
v_add_u32_e32 v5, s12, v5 | |
s_addc_u32 s3, s3, -1 | |
s_waitcnt lgkmcnt(0) | |
s_cbranch_vccz .LBB0_9 | |
.LBB0_3: | |
v_cmp_lt_i64_e64 s[8:9], s[2:3], 64 | |
s_and_b64 s[8:9], s[8:9], exec | |
s_cselect_b32 s8, s2, 64 | |
s_waitcnt vmcnt(1) | |
v_sub_u32_e32 v7, s8, v0 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v6, 0 | |
v_cmp_lt_i32_e32 vcc, 0, v7 | |
v_mov_b32_e32 v7, 0 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_cbranch_execnz .LBB0_6 | |
s_or_b64 exec, exec, s[8:9] | |
buffer_load_ubyte v8, v4, s[28:31], 0 offen | |
s_and_saveexec_b64 s[8:9], vcc | |
s_cbranch_execnz .LBB0_7 | |
.LBB0_5: | |
s_or_b64 exec, exec, s[8:9] | |
s_and_saveexec_b64 s[8:9], vcc | |
s_cbranch_execz .LBB0_2 | |
s_branch .LBB0_8 | |
.LBB0_6: | |
buffer_load_sbyte v7, v5, s[4:7], 0 offen | |
s_or_b64 exec, exec, s[8:9] | |
buffer_load_ubyte v8, v4, s[28:31], 0 offen | |
s_and_saveexec_b64 s[8:9], vcc | |
s_cbranch_execz .LBB0_5 | |
.LBB0_7: | |
ds_read_b32 v6, v1 | |
s_or_b64 exec, exec, s[8:9] | |
s_and_saveexec_b64 s[8:9], vcc | |
s_cbranch_execz .LBB0_2 | |
.LBB0_8: | |
s_waitcnt vmcnt(0) | |
v_bfe_i32 v8, v8, 0, 8 | |
v_cndmask_b32_e32 v8, 0, v8, vcc | |
s_waitcnt lgkmcnt(0) | |
v_mad_i32_i24 v6, v8, v7, v6 | |
ds_write_b32 v1, v6 | |
s_branch .LBB0_2 | |
.LBB0_9: | |
ds_read_b32 v2, v1 | |
.LBB0_10: | |
v_mbcnt_lo_u32_b32 v1, -1, 0 | |
v_mbcnt_hi_u32_b32 v3, -1, v1 | |
v_and_b32_e32 v1, 64, v3 | |
v_add_u32_e32 v4, 64, v1 | |
v_xor_b32_e32 v1, 1, v3 | |
v_cmp_lt_i32_e32 vcc, v1, v4 | |
v_xor_b32_e32 v5, 2, v3 | |
s_nop 0 | |
v_cndmask_b32_e32 v1, v3, v1, vcc | |
v_lshlrev_b32_e32 v1, 2, v1 | |
s_waitcnt lgkmcnt(0) | |
ds_bpermute_b32 v1, v1, v2 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 4, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 8, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 16, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 32, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_cmp_eq_u32_e32 vcc, 0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_cbranch_execz .LBB0_12 | |
s_mul_i32 s0, s26, s17 | |
s_lshl_b32 s0, s0, 2 | |
s_lshl_b32 s1, s16, 2 | |
s_add_i32 s0, s0, s1 | |
s_mov_b32 s23, s7 | |
v_mov_b32_e32 v0, s0 | |
buffer_load_dword v3, v0, s[20:23], 0 offen | |
s_waitcnt vmcnt(0) lgkmcnt(0) | |
v_add3_u32 v1, v1, v2, v3 | |
buffer_store_dword v1, v0, s[20:23], 0 offen | |
.LBB0_12: | |
s_endpgm | |
.section .rodata,"a",@progbits | |
.p2align 6, 0x0 | |
.amdhsa_kernel matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.amdhsa_group_segment_fixed_size 0 | |
.amdhsa_private_segment_fixed_size 0 | |
.amdhsa_kernarg_size 72 | |
.amdhsa_user_sgpr_count 16 | |
.amdhsa_user_sgpr_dispatch_ptr 0 | |
.amdhsa_user_sgpr_queue_ptr 0 | |
.amdhsa_user_sgpr_kernarg_segment_ptr 1 | |
.amdhsa_user_sgpr_dispatch_id 0 | |
.amdhsa_user_sgpr_kernarg_preload_length 14 | |
.amdhsa_user_sgpr_kernarg_preload_offset 0 | |
.amdhsa_user_sgpr_private_segment_size 0 | |
.amdhsa_uses_dynamic_stack 0 | |
.amdhsa_enable_private_segment 0 | |
.amdhsa_system_sgpr_workgroup_id_x 1 | |
.amdhsa_system_sgpr_workgroup_id_y 1 | |
.amdhsa_system_sgpr_workgroup_id_z 0 | |
.amdhsa_system_sgpr_workgroup_info 0 | |
.amdhsa_system_vgpr_workitem_id 0 | |
.amdhsa_next_free_vgpr 9 | |
.amdhsa_next_free_sgpr 32 | |
.amdhsa_accum_offset 12 | |
.amdhsa_reserve_vcc 1 | |
.amdhsa_reserve_xnack_mask 1 | |
.amdhsa_float_round_mode_32 0 | |
.amdhsa_float_round_mode_16_64 0 | |
.amdhsa_float_denorm_mode_32 3 | |
.amdhsa_float_denorm_mode_16_64 3 | |
.amdhsa_dx10_clamp 1 | |
.amdhsa_ieee_mode 1 | |
.amdhsa_fp16_overflow 0 | |
.amdhsa_tg_split 0 | |
.amdhsa_exception_fp_ieee_invalid_op 0 | |
.amdhsa_exception_fp_denorm_src 0 | |
.amdhsa_exception_fp_ieee_div_zero 0 | |
.amdhsa_exception_fp_ieee_overflow 0 | |
.amdhsa_exception_fp_ieee_underflow 0 | |
.amdhsa_exception_fp_ieee_inexact 0 | |
.amdhsa_exception_int_div_zero 0 | |
.end_amdhsa_kernel | |
.text | |
.Lfunc_end0: | |
.size matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32, .Lfunc_end0-matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_vgpr, 9 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_agpr, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.numbered_sgpr, 32 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.private_seg_size, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_vcc, 1 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_flat_scratch, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_dyn_sized_stack, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_recursion, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_indirect_call, 0 | |
.p2alignl 6, 3212836864 | |
.fill 256, 4, 3212836864 | |
.section .AMDGPU.gpr_maximums,"",@progbits | |
.set amdgpu.max_num_vgpr, 0 | |
.set amdgpu.max_num_agpr, 0 | |
.set amdgpu.max_num_sgpr, 0 | |
.text | |
.section ".note.GNU-stack","",@progbits | |
.amdgpu_metadata | |
--- | |
amdhsa.kernels: | |
- .agpr_count: 0 | |
.args: | |
- .actual_access: read_only | |
.address_space: global | |
.offset: 0 | |
.size: 8 | |
.value_kind: global_buffer | |
- .actual_access: read_only | |
.address_space: global | |
.offset: 8 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 16 | |
.size: 8 | |
.value_kind: global_buffer | |
- .offset: 24 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 28 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 32 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 36 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 40 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 44 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 48 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 52 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 56 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 60 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 64 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 68 | |
.size: 4 | |
.value_kind: by_value | |
.group_segment_fixed_size: 0 | |
.kernarg_segment_align: 8 | |
.kernarg_segment_size: 72 | |
.max_flat_workgroup_size: 64 | |
.name: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.private_segment_fixed_size: 0 | |
.reqd_workgroup_size: | |
- 64 | |
- 1 | |
- 1 | |
.sgpr_count: 38 | |
.sgpr_spill_count: 0 | |
.symbol: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.kd | |
.uniform_work_group_size: 1 | |
.uses_dynamic_stack: false | |
.vgpr_count: 9 | |
.vgpr_spill_count: 0 | |
.wavefront_size: 64 | |
amdhsa.target: amdgcn-amd-amdhsa--gfx942 | |
amdhsa.version: | |
- 1 | |
- 2 | |
... | |
.end_amdgpu_metadata |
.amdgcn_target "amdgcn-amd-amdhsa--gfx942" | |
.amdhsa_code_object_version 5 | |
.text | |
.globl matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.p2align 8 | |
.type matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32,@function | |
matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32: | |
s_load_dwordx2 s[2:3], s[0:1], 0x0 | |
s_load_dwordx8 s[4:11], s[0:1], 0x8 | |
s_load_dwordx4 s[12:15], s[0:1], 0x28 | |
s_waitcnt lgkmcnt(0) | |
s_branch .LBB0_0 | |
.p2align 8 | |
.LBB0_0: | |
s_mov_b64 s[28:29], s[2:3] | |
s_mul_i32 s2, s10, s9 | |
s_mul_hi_u32 s3, s10, s8 | |
s_add_i32 s2, s3, s2 | |
s_mul_i32 s3, s11, s8 | |
s_add_i32 s3, s2, s3 | |
s_mul_i32 s2, s10, s8 | |
v_mov_b64_e32 v[6:7], s[10:11] | |
v_cmp_gt_u64_e32 vcc, s[2:3], v[6:7] | |
s_mov_b64 s[20:21], s[6:7] | |
s_and_b64 s[6:7], vcc, exec | |
s_load_dwordx4 s[24:27], s[0:1], 0x38 | |
s_cselect_b32 s30, s2, s10 | |
s_load_dwordx2 s[0:1], s[0:1], 0x38 | |
s_mul_i32 s2, s14, s13 | |
s_mul_hi_u32 s3, s14, s12 | |
v_mov_b32_e32 v3, s15 | |
s_add_i32 s2, s3, s2 | |
s_mul_i32 s15, s15, s12 | |
v_mov_b32_e32 v2, s14 | |
s_add_i32 s3, s2, s15 | |
s_mul_i32 s2, s14, s12 | |
v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] | |
s_and_b64 s[8:9], vcc, exec | |
s_cselect_b32 s6, s2, s14 | |
s_waitcnt lgkmcnt(0) | |
s_mul_i32 s1, s26, s1 | |
s_mul_hi_u32 s2, s26, s0 | |
s_add_i32 s1, s2, s1 | |
s_mul_i32 s2, s27, s0 | |
v_mov_b32_e32 v4, s26 | |
v_mov_b32_e32 v5, s27 | |
s_add_i32 s1, s1, s2 | |
s_mul_i32 s0, s26, s0 | |
v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] | |
s_and_b64 s[2:3], vcc, exec | |
s_cselect_b32 s0, s0, s26 | |
s_lshl_b32 s22, s0, 2 | |
s_and_b32 s21, s21, 0xffff | |
s_mov_b32 s7, 0x27000 | |
v_lshl_add_u32 v1, v0, 2, 0 | |
v_mov_b32_e32 v2, 0 | |
s_cmp_eq_u64 s[10:11], 0 | |
s_mov_b64 s[0:1], 0 | |
ds_write_b32 v1, v2 | |
s_cbranch_scc1 .LBB0_10 | |
s_and_b32 s29, s29, 0xffff | |
s_and_b32 s5, s5, 0xffff | |
s_mul_i32 s2, s17, s10 | |
s_lshl_b32 s12, s14, 6 | |
v_add_u32_e32 v4, s2, v0 | |
v_mul_lo_u32 v2, v0, s14 | |
s_add_u32 s2, 0, s10 | |
v_add_u32_e32 v5, s16, v2 | |
s_addc_u32 s3, s11, 0 | |
v_mov_b64_e32 v[2:3], s[10:11] | |
s_mov_b32 s31, s7 | |
v_mov_b32_e32 v6, 0 | |
s_branch .LBB0_3 | |
.LBB0_2: | |
s_or_b64 exec, exec, s[8:9] | |
s_add_u32 s0, s0, 64 | |
s_addc_u32 s1, s1, 0 | |
s_add_u32 s2, s2, 0xffffffc0 | |
v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] | |
v_add_u32_e32 v4, 64, v4 | |
v_add_u32_e32 v5, s12, v5 | |
s_addc_u32 s3, s3, -1 | |
s_waitcnt lgkmcnt(0) | |
s_cbranch_vccz .LBB0_9 | |
.LBB0_3: | |
v_cmp_lt_i64_e64 s[8:9], s[2:3], 64 | |
s_and_b64 s[8:9], s[8:9], exec | |
s_cselect_b32 s8, s2, 64 | |
s_waitcnt vmcnt(1) | |
v_sub_u32_e32 v7, s8, v0 | |
v_cmp_lt_i32_e32 vcc, 0, v7 | |
v_mov_b32_e32 v7, 0 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_cbranch_execz .LBB0_5 | |
buffer_load_sbyte v7, v5, s[4:7], 0 offen | |
.LBB0_5: | |
s_or_b64 exec, exec, s[8:9] | |
buffer_load_ubyte v8, v4, s[28:31], 0 offen | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v9, 0 | |
s_and_saveexec_b64 s[8:9], vcc | |
ds_read_b32 v9, v1 | |
s_or_b64 exec, exec, s[8:9] | |
s_and_saveexec_b64 s[8:9], vcc | |
s_cbranch_execz .LBB0_2 | |
s_waitcnt vmcnt(0) | |
v_cndmask_b32_sdwa v8, v6, -v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | |
s_waitcnt lgkmcnt(0) | |
v_mad_i32_i24 v7, v8, v7, v9 | |
ds_write_b32 v1, v7 | |
s_branch .LBB0_2 | |
.LBB0_9: | |
ds_read_b32 v2, v1 | |
.LBB0_10: | |
v_mbcnt_lo_u32_b32 v1, -1, 0 | |
v_mbcnt_hi_u32_b32 v3, -1, v1 | |
v_and_b32_e32 v1, 64, v3 | |
v_add_u32_e32 v4, 64, v1 | |
v_xor_b32_e32 v1, 1, v3 | |
v_cmp_lt_i32_e32 vcc, v1, v4 | |
v_xor_b32_e32 v5, 2, v3 | |
s_nop 0 | |
v_cndmask_b32_e32 v1, v3, v1, vcc | |
v_lshlrev_b32_e32 v1, 2, v1 | |
s_waitcnt lgkmcnt(0) | |
ds_bpermute_b32 v1, v1, v2 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 4, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 8, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 16, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_xor_b32_e32 v5, 32, v3 | |
v_cmp_lt_i32_e32 vcc, v5, v4 | |
s_waitcnt lgkmcnt(0) | |
v_add_u32_e32 v1, v1, v2 | |
v_cndmask_b32_e32 v2, v3, v5, vcc | |
v_lshlrev_b32_e32 v2, 2, v2 | |
ds_bpermute_b32 v2, v2, v1 | |
v_cmp_eq_u32_e32 vcc, 0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_cbranch_execz .LBB0_12 | |
s_mul_i32 s0, s26, s17 | |
s_lshl_b32 s0, s0, 2 | |
s_lshl_b32 s1, s16, 2 | |
s_add_i32 s0, s0, s1 | |
s_mov_b32 s23, s7 | |
v_mov_b32_e32 v0, s0 | |
buffer_load_dword v3, v0, s[20:23], 0 offen | |
s_waitcnt vmcnt(0) lgkmcnt(0) | |
v_add3_u32 v1, v1, v2, v3 | |
buffer_store_dword v1, v0, s[20:23], 0 offen | |
.LBB0_12: | |
s_endpgm | |
.section .rodata,"a",@progbits | |
.p2align 6, 0x0 | |
.amdhsa_kernel matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.amdhsa_group_segment_fixed_size 0 | |
.amdhsa_private_segment_fixed_size 0 | |
.amdhsa_kernarg_size 72 | |
.amdhsa_user_sgpr_count 16 | |
.amdhsa_user_sgpr_dispatch_ptr 0 | |
.amdhsa_user_sgpr_queue_ptr 0 | |
.amdhsa_user_sgpr_kernarg_segment_ptr 1 | |
.amdhsa_user_sgpr_dispatch_id 0 | |
.amdhsa_user_sgpr_kernarg_preload_length 14 | |
.amdhsa_user_sgpr_kernarg_preload_offset 0 | |
.amdhsa_user_sgpr_private_segment_size 0 | |
.amdhsa_uses_dynamic_stack 0 | |
.amdhsa_enable_private_segment 0 | |
.amdhsa_system_sgpr_workgroup_id_x 1 | |
.amdhsa_system_sgpr_workgroup_id_y 1 | |
.amdhsa_system_sgpr_workgroup_id_z 0 | |
.amdhsa_system_sgpr_workgroup_info 0 | |
.amdhsa_system_vgpr_workitem_id 0 | |
.amdhsa_next_free_vgpr 10 | |
.amdhsa_next_free_sgpr 32 | |
.amdhsa_accum_offset 12 | |
.amdhsa_reserve_vcc 1 | |
.amdhsa_reserve_xnack_mask 1 | |
.amdhsa_float_round_mode_32 0 | |
.amdhsa_float_round_mode_16_64 0 | |
.amdhsa_float_denorm_mode_32 3 | |
.amdhsa_float_denorm_mode_16_64 3 | |
.amdhsa_dx10_clamp 1 | |
.amdhsa_ieee_mode 1 | |
.amdhsa_fp16_overflow 0 | |
.amdhsa_tg_split 0 | |
.amdhsa_exception_fp_ieee_invalid_op 0 | |
.amdhsa_exception_fp_denorm_src 0 | |
.amdhsa_exception_fp_ieee_div_zero 0 | |
.amdhsa_exception_fp_ieee_overflow 0 | |
.amdhsa_exception_fp_ieee_underflow 0 | |
.amdhsa_exception_fp_ieee_inexact 0 | |
.amdhsa_exception_int_div_zero 0 | |
.end_amdhsa_kernel | |
.text | |
.Lfunc_end0: | |
.size matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32, .Lfunc_end0-matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_vgpr, 10 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_agpr, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.numbered_sgpr, 32 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.private_seg_size, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_vcc, 1 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_flat_scratch, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_dyn_sized_stack, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_recursion, 0 | |
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_indirect_call, 0 | |
.p2alignl 6, 3212836864 | |
.fill 256, 4, 3212836864 | |
.section .AMDGPU.gpr_maximums,"",@progbits | |
.set amdgpu.max_num_vgpr, 0 | |
.set amdgpu.max_num_agpr, 0 | |
.set amdgpu.max_num_sgpr, 0 | |
.text | |
.section ".note.GNU-stack","",@progbits | |
.amdgpu_metadata | |
--- | |
amdhsa.kernels: | |
- .agpr_count: 0 | |
.args: | |
- .actual_access: read_only | |
.address_space: global | |
.offset: 0 | |
.size: 8 | |
.value_kind: global_buffer | |
- .actual_access: read_only | |
.address_space: global | |
.offset: 8 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 16 | |
.size: 8 | |
.value_kind: global_buffer | |
- .offset: 24 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 28 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 32 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 36 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 40 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 44 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 48 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 52 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 56 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 60 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 64 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 68 | |
.size: 4 | |
.value_kind: by_value | |
.group_segment_fixed_size: 0 | |
.kernarg_segment_align: 8 | |
.kernarg_segment_size: 72 | |
.max_flat_workgroup_size: 64 | |
.name: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32 | |
.private_segment_fixed_size: 0 | |
.reqd_workgroup_size: | |
- 64 | |
- 1 | |
- 1 | |
.sgpr_count: 38 | |
.sgpr_spill_count: 0 | |
.symbol: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.kd | |
.uniform_work_group_size: 1 | |
.uses_dynamic_stack: false | |
.vgpr_count: 10 | |
.vgpr_spill_count: 0 | |
.wavefront_size: 64 | |
amdhsa.target: amdgcn-amd-amdhsa--gfx942 | |
amdhsa.version: | |
- 1 | |
- 2 | |
... | |
.end_amdgpu_metadata |
; To reproduce the .optimized.ll from the .linked.ll, run: | |
; opt --passes='verify,annotation2metadata,forceattrs,inferattrs,coro-early,function<eager-inv>(ee-instrument<>,lower-expect,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sroa<modify-cfg>,early-cse<>),openmp-opt,amdgpu-printf-runtime-binding,amdgpu-unify-metadata,ipsccp,called-value-propagation,globalopt,function<eager-inv>(mem2reg,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>),always-inline,require<globals-aa>,function(invalidate<aa>),require<profile-summary>,cgscc(devirt<4>(inline,function-attrs<skip-non-recursive-function-attrs>,openmp-opt-cgscc,function(amdgpu-promote-kernel-arguments,infer-address-spaces,amdgpu-lower-kernel-attributes,amdgpu-promote-alloca-to-vector),function<eager-inv;no-rerun>(sroa<modify-cfg>,early-cse<memssa>,speculative-execution<only-if-divergent-target>,jump-threading,correlated-propagation,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,aggressive-instcombine,libcalls-shrinkwrap,amdgpu-usenative,amdgpu-simplifylib,tailcallelim,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,reassociate,constraint-elimination,loop-mssa(loop-instsimplify,loop-simplifycfg,licm<no-allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto>,licm<allowspeculation>,simple-loop-unswitch<no-nontrivial;trivial>),simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,loop(loop-idiom,indvars,extra-simple-loop-unswitch-passes,loop-deletion,loop-unroll-full),sroa<modify-cfg>,vector-combine,mldst-motion<no-split-footer-bb>,gvn<>,sccp,bdce,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,jump-threading,correlated-propagation,adce,memcpyopt,dse,move-auto-init,loop-mssa(licm<allowspeculation>),coro-elide,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib),function-attrs,function(require<should-not-run-function-passes>),coro-split,coro-annotation-elide)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,function<eager-inv>(float2int,lower-constant-intrinsics,loop(loop-rotate<header-duplication;no-prepare-for-lto>,loop-deletion),loop-distribute,inject-tli-mappings,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-load-elim,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,slp-vectorizer,vector-combine,instcombine<max-iterations=1;no-verify-fixpoint>,loop-unroll<O2>,transform-warning,sroa<preserve-cfg>,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,loop-sink,instsimplify,div-rem-pairs,tailcallelim,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>),amdgpu-attributor,globaldce,constmerge,cg-profile,rel-lookup-table-converter,function(annotation-remarks),verify' | |
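; As a usage sketch only (the .ll file names below are illustrative placeholders,
; not part of this gist), the reproduction step above amounts to a single opt
; invocation with the pass pipeline quoted above:
;   opt -S --passes='<pipeline quoted above>' \
;       matmul_accumulate_dispatch_0.linked.ll \
;       -o matmul_accumulate_dispatch_0.optimized.ll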
; ModuleID = 'matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0' | |
source_filename = "matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0" | |
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" | |
@__dynamic_shared_memory__ = external addrspace(3) global [0 x i8], align 16 | |
; Function Attrs: alwaysinline nofree norecurse nounwind | |
define amdgpu_kernel void @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32(ptr addrspace(1) inreg noalias noundef nonnull readonly align 16 %0, ptr addrspace(1) inreg noalias noundef nonnull readonly align 16 %1, ptr addrspace(1) inreg noalias noundef nonnull align 16 %2, i32 inreg noundef %3, i32 inreg noundef %4, i32 inreg noundef %5, i32 inreg noundef %6, i32 inreg noundef %7, i32 inreg noundef %8, i32 inreg noundef %9, i32 inreg noundef %10, i32 inreg noundef %11, i32 inreg noundef %12, i32 inreg noundef %13, i32 inreg noundef %14) local_unnamed_addr #0 !reqd_work_group_size !2 { | |
%16 = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x() | |
%17 = zext nneg i32 %16 to i64 | |
%18 = zext i32 %3 to i64 | |
%19 = zext i32 %4 to i64 | |
%20 = shl nuw i64 %19, 32 | |
%21 = or disjoint i64 %20, %18 | |
%22 = zext i32 %5 to i64 | |
%23 = zext i32 %6 to i64 | |
%24 = shl nuw i64 %23, 32 | |
%25 = or disjoint i64 %24, %22 | |
%26 = zext i32 %7 to i64 | |
%27 = zext i32 %8 to i64 | |
%28 = shl nuw i64 %27, 32 | |
%29 = or disjoint i64 %28, %26 | |
%30 = zext i32 %9 to i64 | |
%31 = zext i32 %10 to i64 | |
%32 = shl nuw i64 %31, 32 | |
%33 = or disjoint i64 %32, %30 | |
%34 = zext i32 %11 to i64 | |
%35 = zext i32 %12 to i64 | |
%36 = shl nuw i64 %35, 32 | |
%37 = or disjoint i64 %36, %34 | |
%38 = zext i32 %13 to i64 | |
%39 = zext nneg i32 %14 to i64 | |
%40 = shl nuw nsw i64 %39, 32 | |
%41 = or disjoint i64 %40, %38 | |
%42 = icmp ult i32 %4, 2097152 | |
tail call void @llvm.assume(i1 %42) | |
%43 = icmp ult i32 %6, 2097152 | |
tail call void @llvm.assume(i1 %43) | |
%44 = icmp ult i32 %8, 2097152 | |
tail call void @llvm.assume(i1 %44) | |
%45 = icmp ult i32 %10, 2097152 | |
tail call void @llvm.assume(i1 %45) | |
%46 = icmp ult i32 %12, 2097152 | |
tail call void @llvm.assume(i1 %46) | |
%47 = icmp ult i32 %14, 2097152 | |
tail call void @llvm.assume(i1 %47) | |
%48 = mul i64 %25, %21 | |
%49 = tail call i64 @llvm.umax.i64(i64 %48, i64 %25) | |
%50 = trunc i64 %49 to i32 | |
%51 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %0, i16 0, i32 %50, i32 159744) | |
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %51, i64 64) ] | |
%52 = mul i64 %33, %29 | |
%53 = tail call i64 @llvm.umax.i64(i64 %52, i64 %33) | |
%54 = trunc i64 %53 to i32 | |
%55 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %1, i16 0, i32 %54, i32 159744) | |
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %55, i64 64) ] | |
%56 = mul i64 %41, %37 | |
%57 = tail call i64 @llvm.umax.i64(i64 %56, i64 %41) | |
%58 = trunc i64 %57 to i32 | |
%59 = shl i32 %58, 2 | |
%60 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %2, i16 0, i32 %59, i32 159744) | |
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %60, i64 64) ] | |
%61 = tail call range(i32 0, 2147483647) i32 @llvm.amdgcn.workgroup.id.x() | |
%62 = tail call range(i32 0, 2147483647) i32 @llvm.amdgcn.workgroup.id.y() | |
%63 = getelementptr i32, ptr addrspace(3) @__dynamic_shared_memory__, i32 %16 | |
store <1 x i32> zeroinitializer, ptr addrspace(3) %63, align 4 | |
%invariant.gep = getelementptr i8, ptr addrspace(7) %55, i32 %61 | |
%.not = icmp eq i64 %25, 0 | |
br i1 %.not, label %._crit_edge, label %.lr.ph | |
.lr.ph: ; preds = %15 | |
%64 = zext nneg i32 %62 to i64 | |
%65 = mul nuw nsw i64 %22, %64 | |
br label %66 | |
66: ; preds = %.lr.ph, %82 | |
%67 = phi i64 [ 0, %.lr.ph ], [ %93, %82 ] | |
%68 = sub nuw nsw i64 %25, %67 | |
%69 = tail call i64 @llvm.smin.i64(i64 %68, i64 64) | |
%70 = trunc nuw nsw i64 %69 to i32 | |
%71 = sub i32 %70, %16 | |
%72 = insertelement <1 x i32> poison, i32 %71, i64 0 | |
%73 = icmp sgt <1 x i32> %72, zeroinitializer | |
%74 = or disjoint i64 %67, %17 | |
%75 = extractelement <1 x i1> %73, i64 0 | |
br i1 %75, label %76, label %82 | |
76: ; preds = %66 | |
%77 = trunc i64 %74 to i32 | |
%78 = mul i32 %9, %77 | |
%gep = getelementptr i8, ptr addrspace(7) %invariant.gep, i32 %78 | |
%79 = load i8, ptr addrspace(7) %gep, align 1 | |
%80 = insertelement <1 x i8> poison, i8 %79, i64 0 | |
%81 = sext <1 x i8> %80 to <1 x i32> | |
br label %82 | |
82: ; preds = %66, %76 | |
%83 = phi <1 x i32> [ %81, %76 ], [ zeroinitializer, %66 ] | |
%84 = add nuw i64 %74, %65 | |
%85 = trunc i64 %84 to i32 | |
%86 = getelementptr i8, ptr addrspace(7) %51, i32 %85 | |
%87 = load <1 x i8>, ptr addrspace(7) %86, align 1 | |
%88 = sext <1 x i8> %87 to <1 x i32> | |
%89 = select <1 x i1> %73, <1 x i32> %88, <1 x i32> zeroinitializer | |
%90 = tail call <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) %63, i32 4, <1 x i1> %73, <1 x i32> zeroinitializer) | |
%91 = mul nsw <1 x i32> %89, %83 | |
%92 = add <1 x i32> %91, %90 | |
tail call void @llvm.masked.store.v1i32.p3(<1 x i32> %92, ptr addrspace(3) %63, i32 4, <1 x i1> %73) | |
tail call void @llvm.amdgcn.s.waitcnt(i32 -7937) | |
tail call void @llvm.amdgcn.s.barrier() | |
%93 = add nuw nsw i64 %67, 64 | |
%94 = icmp samesign ult i64 %93, %25 | |
br i1 %94, label %66, label %._crit_edge.loopexit | |
._crit_edge.loopexit: ; preds = %82 | |
%.pre = load <1 x i32>, ptr addrspace(3) %63, align 4 | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %15 | |
%95 = phi <1 x i32> [ %.pre, %._crit_edge.loopexit ], [ zeroinitializer, %15 ] | |
%96 = extractelement <1 x i32> %95, i64 0 | |
%97 = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) | |
%98 = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %97) | |
%99 = and i32 %98, -64 | |
%100 = add i32 %99, 64 | |
%101 = xor i32 %98, 1 | |
%102 = icmp slt i32 %101, %100 | |
%103 = select i1 %102, i32 %101, i32 %98 | |
%104 = shl i32 %103, 2 | |
%105 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %104, i32 %96) | |
%106 = add i32 %105, %96 | |
%107 = xor i32 %98, 2 | |
%108 = icmp slt i32 %107, %100 | |
%109 = select i1 %108, i32 %107, i32 %98 | |
%110 = shl i32 %109, 2 | |
%111 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %110, i32 %106) | |
%112 = add i32 %106, %111 | |
%113 = xor i32 %98, 4 | |
%114 = icmp slt i32 %113, %100 | |
%115 = select i1 %114, i32 %113, i32 %98 | |
%116 = shl i32 %115, 2 | |
%117 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %116, i32 %112) | |
%118 = add i32 %112, %117 | |
%119 = xor i32 %98, 8 | |
%120 = icmp slt i32 %119, %100 | |
%121 = select i1 %120, i32 %119, i32 %98 | |
%122 = shl i32 %121, 2 | |
%123 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %122, i32 %118) | |
%124 = add i32 %118, %123 | |
%125 = xor i32 %98, 16 | |
%126 = icmp slt i32 %125, %100 | |
%127 = select i1 %126, i32 %125, i32 %98 | |
%128 = shl i32 %127, 2 | |
%129 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %128, i32 %124) | |
%130 = add i32 %124, %129 | |
%131 = xor i32 %98, 32 | |
%132 = icmp slt i32 %131, %100 | |
%133 = select i1 %132, i32 %131, i32 %98 | |
%134 = shl i32 %133, 2 | |
%135 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %134, i32 %130) | |
%136 = icmp eq i32 %16, 0 | |
br i1 %136, label %137, label %146 | |
137: ; preds = %._crit_edge | |
%138 = add i32 %130, %135 | |
%139 = mul i32 %13, %62 | |
%140 = getelementptr i32, ptr addrspace(7) %60, i32 %139 | |
%141 = getelementptr i32, ptr addrspace(7) %140, i32 %61 | |
%142 = load <1 x i32>, ptr addrspace(7) %141, align 4 | |
%143 = extractelement <1 x i32> %142, i64 0 | |
%144 = add i32 %138, %143 | |
%145 = insertelement <1 x i32> poison, i32 %144, i64 0 | |
store <1 x i32> %145, ptr addrspace(7) %141, align 4 | |
br label %146 | |
146: ; preds = %137, %._crit_edge | |
ret void | |
} | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) | |
declare void @llvm.assume(i1 noundef) #2 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare i64 @llvm.umax.i64(i64, i64) #1 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) readnone, i16, i32, i32) #1 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare noundef i32 @llvm.amdgcn.workgroup.id.y() #1 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none) | |
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none) | |
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3 | |
; Function Attrs: alwaysinline convergent mustprogress nocallback nofree nounwind willreturn memory(none) | |
declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #4 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare i64 @llvm.smin.i64(i64, i64) #1 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) | |
declare <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) captures(none), i32 immarg, <1 x i1>, <1 x i32>) #5 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) | |
declare void @llvm.masked.store.v1i32.p3(<1 x i32>, ptr addrspace(3) captures(none), i32 immarg, <1 x i1>) #6 | |
; Function Attrs: alwaysinline mustprogress nocallback nofree nounwind willreturn | |
declare void @llvm.amdgcn.s.waitcnt(i32 immarg) #7 | |
; Function Attrs: alwaysinline convergent mustprogress nocallback nofree nounwind willreturn | |
declare void @llvm.amdgcn.s.barrier() #8 | |
attributes #0 = { alwaysinline nofree norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } | |
attributes #1 = { alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } | |
attributes #2 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } | |
attributes #3 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none) } | |
attributes #4 = { alwaysinline convergent mustprogress nocallback nofree nounwind willreturn memory(none) } | |
attributes #5 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } | |
attributes #6 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } | |
attributes #7 = { alwaysinline mustprogress nocallback nofree nounwind willreturn } | |
attributes #8 = { alwaysinline convergent mustprogress nocallback nofree nounwind willreturn } | |
!llvm.module.flags = !{!0, !1} | |
!0 = !{i32 2, !"Debug Info Version", i32 3} | |
!1 = !{i32 1, !"amdhsa_code_object_version", i32 500} | |
!2 = !{i32 64, i32 1, i32 1} |