@lialan
Created May 6, 2025 21:19
PR #137930 Triage
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.text
.globl matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.p2align 8
.type matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32,@function
matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32:
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_load_dwordx8 s[4:11], s[0:1], 0x8
s_load_dwordx4 s[12:15], s[0:1], 0x28
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
.p2align 8
.LBB0_0:
s_mov_b64 s[28:29], s[2:3]
s_mul_i32 s2, s10, s9
s_mul_hi_u32 s3, s10, s8
s_add_i32 s2, s3, s2
s_mul_i32 s3, s11, s8
s_add_i32 s3, s2, s3
s_mul_i32 s2, s10, s8
v_mov_b64_e32 v[6:7], s[10:11]
v_cmp_gt_u64_e32 vcc, s[2:3], v[6:7]
s_mov_b64 s[20:21], s[6:7]
s_and_b64 s[6:7], vcc, exec
s_load_dwordx4 s[24:27], s[0:1], 0x38
s_cselect_b32 s30, s2, s10
s_load_dwordx2 s[0:1], s[0:1], 0x38
s_mul_i32 s2, s14, s13
s_mul_hi_u32 s3, s14, s12
v_mov_b32_e32 v3, s15
s_add_i32 s2, s3, s2
s_mul_i32 s15, s15, s12
v_mov_b32_e32 v2, s14
s_add_i32 s3, s2, s15
s_mul_i32 s2, s14, s12
v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
s_and_b64 s[8:9], vcc, exec
s_cselect_b32 s6, s2, s14
s_waitcnt lgkmcnt(0)
s_mul_i32 s1, s26, s1
s_mul_hi_u32 s2, s26, s0
s_add_i32 s1, s2, s1
s_mul_i32 s2, s27, s0
v_mov_b32_e32 v4, s26
v_mov_b32_e32 v5, s27
s_add_i32 s1, s1, s2
s_mul_i32 s0, s26, s0
v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
s_and_b64 s[2:3], vcc, exec
s_cselect_b32 s0, s0, s26
s_lshl_b32 s22, s0, 2
s_and_b32 s21, s21, 0xffff
s_mov_b32 s7, 0x27000
v_lshl_add_u32 v1, v0, 2, 0
v_mov_b32_e32 v2, 0
s_cmp_eq_u64 s[10:11], 0
s_mov_b64 s[0:1], 0
ds_write_b32 v1, v2
s_cbranch_scc1 .LBB0_10
s_and_b32 s29, s29, 0xffff
s_and_b32 s5, s5, 0xffff
s_mul_i32 s2, s17, s10
s_lshl_b32 s12, s14, 6
v_add_u32_e32 v4, s2, v0
v_mul_lo_u32 v2, v0, s14
s_add_u32 s2, 0, s10
v_add_u32_e32 v5, s16, v2
s_addc_u32 s3, s11, 0
v_mov_b64_e32 v[2:3], s[10:11]
s_mov_b32 s31, s7
s_branch .LBB0_3
.LBB0_2:
s_or_b64 exec, exec, s[8:9]
s_add_u32 s0, s0, 64
s_addc_u32 s1, s1, 0
s_add_u32 s2, s2, 0xffffffc0
v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
v_add_u32_e32 v4, 64, v4
v_add_u32_e32 v5, s12, v5
s_addc_u32 s3, s3, -1
s_waitcnt lgkmcnt(0)
s_cbranch_vccz .LBB0_9
.LBB0_3:
v_cmp_lt_i64_e64 s[8:9], s[2:3], 64
s_and_b64 s[8:9], s[8:9], exec
s_cselect_b32 s8, s2, 64
s_waitcnt vmcnt(1)
v_sub_u32_e32 v7, s8, v0
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v6, 0
v_cmp_lt_i32_e32 vcc, 0, v7
v_mov_b32_e32 v7, 0
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execnz .LBB0_6
s_or_b64 exec, exec, s[8:9]
buffer_load_ubyte v8, v4, s[28:31], 0 offen
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execnz .LBB0_7
.LBB0_5:
s_or_b64 exec, exec, s[8:9]
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_2
s_branch .LBB0_8
.LBB0_6:
buffer_load_sbyte v7, v5, s[4:7], 0 offen
s_or_b64 exec, exec, s[8:9]
buffer_load_ubyte v8, v4, s[28:31], 0 offen
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_5
.LBB0_7:
ds_read_b32 v6, v1
s_or_b64 exec, exec, s[8:9]
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_2
.LBB0_8:
s_waitcnt vmcnt(0)
v_bfe_i32 v8, v8, 0, 8
v_cndmask_b32_e32 v8, 0, v8, vcc
s_waitcnt lgkmcnt(0)
v_mad_i32_i24 v6, v8, v7, v6
ds_write_b32 v1, v6
s_branch .LBB0_2
.LBB0_9:
ds_read_b32 v2, v1
.LBB0_10:
v_mbcnt_lo_u32_b32 v1, -1, 0
v_mbcnt_hi_u32_b32 v3, -1, v1
v_and_b32_e32 v1, 64, v3
v_add_u32_e32 v4, 64, v1
v_xor_b32_e32 v1, 1, v3
v_cmp_lt_i32_e32 vcc, v1, v4
v_xor_b32_e32 v5, 2, v3
s_nop 0
v_cndmask_b32_e32 v1, v3, v1, vcc
v_lshlrev_b32_e32 v1, 2, v1
s_waitcnt lgkmcnt(0)
ds_bpermute_b32 v1, v1, v2
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 4, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 8, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 16, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 32, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_cmp_eq_u32_e32 vcc, 0, v0
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_12
s_mul_i32 s0, s26, s17
s_lshl_b32 s0, s0, 2
s_lshl_b32 s1, s16, 2
s_add_i32 s0, s0, s1
s_mov_b32 s23, s7
v_mov_b32_e32 v0, s0
buffer_load_dword v3, v0, s[20:23], 0 offen
s_waitcnt vmcnt(0) lgkmcnt(0)
v_add3_u32 v1, v1, v2, v3
buffer_store_dword v1, v0, s[20:23], 0 offen
.LBB0_12:
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 72
.amdhsa_user_sgpr_count 16
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 14
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 9
.amdhsa_next_free_sgpr 32
.amdhsa_accum_offset 12
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32, .Lfunc_end0-matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_vgpr, 9
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_agpr, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.numbered_sgpr, 32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.private_seg_size, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_vcc, 1
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_flat_scratch, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_dyn_sized_stack, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_recursion, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .actual_access: read_only
.address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .actual_access: read_only
.address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
- .offset: 24
.size: 4
.value_kind: by_value
- .offset: 28
.size: 4
.value_kind: by_value
- .offset: 32
.size: 4
.value_kind: by_value
- .offset: 36
.size: 4
.value_kind: by_value
- .offset: 40
.size: 4
.value_kind: by_value
- .offset: 44
.size: 4
.value_kind: by_value
- .offset: 48
.size: 4
.value_kind: by_value
- .offset: 52
.size: 4
.value_kind: by_value
- .offset: 56
.size: 4
.value_kind: by_value
- .offset: 60
.size: 4
.value_kind: by_value
- .offset: 64
.size: 4
.value_kind: by_value
- .offset: 68
.size: 4
.value_kind: by_value
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 72
.max_flat_workgroup_size: 64
.name: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 64
- 1
- 1
.sgpr_count: 38
.sgpr_spill_count: 0
.symbol: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 9
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.text
.globl matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.p2align 8
.type matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32,@function
matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32:
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_load_dwordx8 s[4:11], s[0:1], 0x8
s_load_dwordx4 s[12:15], s[0:1], 0x28
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
.p2align 8
.LBB0_0:
s_mov_b64 s[28:29], s[2:3]
s_mul_i32 s2, s10, s9
s_mul_hi_u32 s3, s10, s8
s_add_i32 s2, s3, s2
s_mul_i32 s3, s11, s8
s_add_i32 s3, s2, s3
s_mul_i32 s2, s10, s8
v_mov_b64_e32 v[6:7], s[10:11]
v_cmp_gt_u64_e32 vcc, s[2:3], v[6:7]
s_mov_b64 s[20:21], s[6:7]
s_and_b64 s[6:7], vcc, exec
s_load_dwordx4 s[24:27], s[0:1], 0x38
s_cselect_b32 s30, s2, s10
s_load_dwordx2 s[0:1], s[0:1], 0x38
s_mul_i32 s2, s14, s13
s_mul_hi_u32 s3, s14, s12
v_mov_b32_e32 v3, s15
s_add_i32 s2, s3, s2
s_mul_i32 s15, s15, s12
v_mov_b32_e32 v2, s14
s_add_i32 s3, s2, s15
s_mul_i32 s2, s14, s12
v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
s_and_b64 s[8:9], vcc, exec
s_cselect_b32 s6, s2, s14
s_waitcnt lgkmcnt(0)
s_mul_i32 s1, s26, s1
s_mul_hi_u32 s2, s26, s0
s_add_i32 s1, s2, s1
s_mul_i32 s2, s27, s0
v_mov_b32_e32 v4, s26
v_mov_b32_e32 v5, s27
s_add_i32 s1, s1, s2
s_mul_i32 s0, s26, s0
v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
s_and_b64 s[2:3], vcc, exec
s_cselect_b32 s0, s0, s26
s_lshl_b32 s22, s0, 2
s_and_b32 s21, s21, 0xffff
s_mov_b32 s7, 0x27000
v_lshl_add_u32 v1, v0, 2, 0
v_mov_b32_e32 v2, 0
s_cmp_eq_u64 s[10:11], 0
s_mov_b64 s[0:1], 0
ds_write_b32 v1, v2
s_cbranch_scc1 .LBB0_10
s_and_b32 s29, s29, 0xffff
s_and_b32 s5, s5, 0xffff
s_mul_i32 s2, s17, s10
s_lshl_b32 s12, s14, 6
v_add_u32_e32 v4, s2, v0
v_mul_lo_u32 v2, v0, s14
s_add_u32 s2, 0, s10
v_add_u32_e32 v5, s16, v2
s_addc_u32 s3, s11, 0
v_mov_b64_e32 v[2:3], s[10:11]
s_mov_b32 s31, s7
v_mov_b32_e32 v6, 0
s_branch .LBB0_3
.LBB0_2:
s_or_b64 exec, exec, s[8:9]
s_add_u32 s0, s0, 64
s_addc_u32 s1, s1, 0
s_add_u32 s2, s2, 0xffffffc0
v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
v_add_u32_e32 v4, 64, v4
v_add_u32_e32 v5, s12, v5
s_addc_u32 s3, s3, -1
s_waitcnt lgkmcnt(0)
s_cbranch_vccz .LBB0_9
.LBB0_3:
v_cmp_lt_i64_e64 s[8:9], s[2:3], 64
s_and_b64 s[8:9], s[8:9], exec
s_cselect_b32 s8, s2, 64
s_waitcnt vmcnt(1)
v_sub_u32_e32 v7, s8, v0
v_cmp_lt_i32_e32 vcc, 0, v7
v_mov_b32_e32 v7, 0
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_5
buffer_load_sbyte v7, v5, s[4:7], 0 offen
.LBB0_5:
s_or_b64 exec, exec, s[8:9]
buffer_load_ubyte v8, v4, s[28:31], 0 offen
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v9, 0
s_and_saveexec_b64 s[8:9], vcc
ds_read_b32 v9, v1
s_or_b64 exec, exec, s[8:9]
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_2
s_waitcnt vmcnt(0)
v_cndmask_b32_sdwa v8, v6, -v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
s_waitcnt lgkmcnt(0)
v_mad_i32_i24 v7, v8, v7, v9
ds_write_b32 v1, v7
s_branch .LBB0_2
.LBB0_9:
ds_read_b32 v2, v1
.LBB0_10:
v_mbcnt_lo_u32_b32 v1, -1, 0
v_mbcnt_hi_u32_b32 v3, -1, v1
v_and_b32_e32 v1, 64, v3
v_add_u32_e32 v4, 64, v1
v_xor_b32_e32 v1, 1, v3
v_cmp_lt_i32_e32 vcc, v1, v4
v_xor_b32_e32 v5, 2, v3
s_nop 0
v_cndmask_b32_e32 v1, v3, v1, vcc
v_lshlrev_b32_e32 v1, 2, v1
s_waitcnt lgkmcnt(0)
ds_bpermute_b32 v1, v1, v2
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 4, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 8, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 16, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 32, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_cmp_eq_u32_e32 vcc, 0, v0
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_12
s_mul_i32 s0, s26, s17
s_lshl_b32 s0, s0, 2
s_lshl_b32 s1, s16, 2
s_add_i32 s0, s0, s1
s_mov_b32 s23, s7
v_mov_b32_e32 v0, s0
buffer_load_dword v3, v0, s[20:23], 0 offen
s_waitcnt vmcnt(0) lgkmcnt(0)
v_add3_u32 v1, v1, v2, v3
buffer_store_dword v1, v0, s[20:23], 0 offen
.LBB0_12:
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 72
.amdhsa_user_sgpr_count 16
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 14
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 10
.amdhsa_next_free_sgpr 32
.amdhsa_accum_offset 12
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32, .Lfunc_end0-matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_vgpr, 10
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_agpr, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.numbered_sgpr, 32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.private_seg_size, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_vcc, 1
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_flat_scratch, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_dyn_sized_stack, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_recursion, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .actual_access: read_only
.address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .actual_access: read_only
.address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
- .offset: 24
.size: 4
.value_kind: by_value
- .offset: 28
.size: 4
.value_kind: by_value
- .offset: 32
.size: 4
.value_kind: by_value
- .offset: 36
.size: 4
.value_kind: by_value
- .offset: 40
.size: 4
.value_kind: by_value
- .offset: 44
.size: 4
.value_kind: by_value
- .offset: 48
.size: 4
.value_kind: by_value
- .offset: 52
.size: 4
.value_kind: by_value
- .offset: 56
.size: 4
.value_kind: by_value
- .offset: 60
.size: 4
.value_kind: by_value
- .offset: 64
.size: 4
.value_kind: by_value
- .offset: 68
.size: 4
.value_kind: by_value
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 72
.max_flat_workgroup_size: 64
.name: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 64
- 1
- 1
.sgpr_count: 38
.sgpr_spill_count: 0
.symbol: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 10
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
; To reproduce the .optimized.ll from the .linked.ll, run:
; opt --passes='verify,annotation2metadata,forceattrs,inferattrs,coro-early,function<eager-inv>(ee-instrument<>,lower-expect,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sroa<modify-cfg>,early-cse<>),openmp-opt,amdgpu-printf-runtime-binding,amdgpu-unify-metadata,ipsccp,called-value-propagation,globalopt,function<eager-inv>(mem2reg,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>),always-inline,require<globals-aa>,function(invalidate<aa>),require<profile-summary>,cgscc(devirt<4>(inline,function-attrs<skip-non-recursive-function-attrs>,openmp-opt-cgscc,function(amdgpu-promote-kernel-arguments,infer-address-spaces,amdgpu-lower-kernel-attributes,amdgpu-promote-alloca-to-vector),function<eager-inv;no-rerun>(sroa<modify-cfg>,early-cse<memssa>,speculative-execution<only-if-divergent-target>,jump-threading,correlated-propagation,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,aggressive-instcombine,libcalls-shrinkwrap,amdgpu-usenative,amdgpu-simplifylib,tailcallelim,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,reassociate,constraint-elimination,loop-mssa(loop-instsimplify,loop-simplifycfg,licm<no-allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto>,licm<allowspeculation>,simple-loop-unswitch<no-nontrivial;trivial>),simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,loop(loop-idiom,indvars,extra-simple-loop-unswitch-passes,loop-deletion,loop-unroll-full),sroa<modify-cfg>,vector-combine,mldst-motion<no-split-footer-bb>,gvn<>,sccp,bdce,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,jump-threading,correlated-propagation,adce,memcpyopt,dse,move-auto-init,loop-mssa(licm<allowspeculation>),coro-elide,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib),function-attrs,function(require<should-not-run-function-passes>),coro-split,coro-annotation-elide)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,function<eager-inv>(float2int,lower-constant-intrinsics,loop(loop-rotate<header-duplication;no-prepare-for-lto>,loop-deletion),loop-distribute,inject-tli-mappings,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-load-elim,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,slp-vectorizer,vector-combine,instcombine<max-iterations=1;no-verify-fixpoint>,loop-unroll<O2>,transform-warning,sroa<preserve-cfg>,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,loop-sink,instsimplify,div-rem-pairs,tailcallelim,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>),amdgpu-attributor,globaldce,constmerge,cg-profile,rel-lookup-table-converter,function(annotation-remarks),verify'
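; A minimal reproduction sketch of that command. The .linked.ll/.optimized.ll filenames below are
; assumptions inferred from the module name and may differ in the actual build tree; the --passes
; string is the one-line pipeline given above, substituted for the <pipeline above> placeholder:
;   opt --passes='<pipeline above>' \
;     matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0.linked.ll \
;     -S -o matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0.optimized.ll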
; ModuleID = 'matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0'
source_filename = "matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
@__dynamic_shared_memory__ = external addrspace(3) global [0 x i8], align 16
; Function Attrs: alwaysinline nofree norecurse nounwind
define amdgpu_kernel void @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32(ptr addrspace(1) inreg noalias noundef nonnull readonly align 16 %0, ptr addrspace(1) inreg noalias noundef nonnull readonly align 16 %1, ptr addrspace(1) inreg noalias noundef nonnull align 16 %2, i32 inreg noundef %3, i32 inreg noundef %4, i32 inreg noundef %5, i32 inreg noundef %6, i32 inreg noundef %7, i32 inreg noundef %8, i32 inreg noundef %9, i32 inreg noundef %10, i32 inreg noundef %11, i32 inreg noundef %12, i32 inreg noundef %13, i32 inreg noundef %14) local_unnamed_addr #0 !reqd_work_group_size !2 {
%16 = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
%17 = zext nneg i32 %16 to i64
%18 = zext i32 %3 to i64
%19 = zext i32 %4 to i64
%20 = shl nuw i64 %19, 32
%21 = or disjoint i64 %20, %18
%22 = zext i32 %5 to i64
%23 = zext i32 %6 to i64
%24 = shl nuw i64 %23, 32
%25 = or disjoint i64 %24, %22
%26 = zext i32 %7 to i64
%27 = zext i32 %8 to i64
%28 = shl nuw i64 %27, 32
%29 = or disjoint i64 %28, %26
%30 = zext i32 %9 to i64
%31 = zext i32 %10 to i64
%32 = shl nuw i64 %31, 32
%33 = or disjoint i64 %32, %30
%34 = zext i32 %11 to i64
%35 = zext i32 %12 to i64
%36 = shl nuw i64 %35, 32
%37 = or disjoint i64 %36, %34
%38 = zext i32 %13 to i64
%39 = zext nneg i32 %14 to i64
%40 = shl nuw nsw i64 %39, 32
%41 = or disjoint i64 %40, %38
%42 = icmp ult i32 %4, 2097152
tail call void @llvm.assume(i1 %42)
%43 = icmp ult i32 %6, 2097152
tail call void @llvm.assume(i1 %43)
%44 = icmp ult i32 %8, 2097152
tail call void @llvm.assume(i1 %44)
%45 = icmp ult i32 %10, 2097152
tail call void @llvm.assume(i1 %45)
%46 = icmp ult i32 %12, 2097152
tail call void @llvm.assume(i1 %46)
%47 = icmp ult i32 %14, 2097152
tail call void @llvm.assume(i1 %47)
%48 = mul i64 %25, %21
%49 = tail call i64 @llvm.umax.i64(i64 %48, i64 %25)
%50 = trunc i64 %49 to i32
%51 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %0, i16 0, i32 %50, i32 159744)
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %51, i64 64) ]
%52 = mul i64 %33, %29
%53 = tail call i64 @llvm.umax.i64(i64 %52, i64 %33)
%54 = trunc i64 %53 to i32
%55 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %1, i16 0, i32 %54, i32 159744)
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %55, i64 64) ]
%56 = mul i64 %41, %37
%57 = tail call i64 @llvm.umax.i64(i64 %56, i64 %41)
%58 = trunc i64 %57 to i32
%59 = shl i32 %58, 2
%60 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %2, i16 0, i32 %59, i32 159744)
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %60, i64 64) ]
%61 = tail call range(i32 0, 2147483647) i32 @llvm.amdgcn.workgroup.id.x()
%62 = tail call range(i32 0, 2147483647) i32 @llvm.amdgcn.workgroup.id.y()
%63 = getelementptr i32, ptr addrspace(3) @__dynamic_shared_memory__, i32 %16
store <1 x i32> zeroinitializer, ptr addrspace(3) %63, align 4
%invariant.gep = getelementptr i8, ptr addrspace(7) %55, i32 %61
%.not = icmp eq i64 %25, 0
br i1 %.not, label %._crit_edge, label %.lr.ph
.lr.ph: ; preds = %15
%64 = zext nneg i32 %62 to i64
%65 = mul nuw nsw i64 %22, %64
br label %66
66: ; preds = %.lr.ph, %82
%67 = phi i64 [ 0, %.lr.ph ], [ %93, %82 ]
%68 = sub nuw nsw i64 %25, %67
%69 = tail call i64 @llvm.smin.i64(i64 %68, i64 64)
%70 = trunc nuw nsw i64 %69 to i32
%71 = sub i32 %70, %16
%72 = insertelement <1 x i32> poison, i32 %71, i64 0
%73 = icmp sgt <1 x i32> %72, zeroinitializer
%74 = or disjoint i64 %67, %17
%75 = extractelement <1 x i1> %73, i64 0
br i1 %75, label %76, label %82
76: ; preds = %66
%77 = trunc i64 %74 to i32
%78 = mul i32 %9, %77
%gep = getelementptr i8, ptr addrspace(7) %invariant.gep, i32 %78
%79 = load i8, ptr addrspace(7) %gep, align 1
%80 = insertelement <1 x i8> poison, i8 %79, i64 0
%81 = sext <1 x i8> %80 to <1 x i32>
br label %82
82: ; preds = %66, %76
%83 = phi <1 x i32> [ %81, %76 ], [ zeroinitializer, %66 ]
%84 = add nuw i64 %74, %65
%85 = trunc i64 %84 to i32
%86 = getelementptr i8, ptr addrspace(7) %51, i32 %85
%87 = load <1 x i8>, ptr addrspace(7) %86, align 1
%88 = sext <1 x i8> %87 to <1 x i32>
%89 = select <1 x i1> %73, <1 x i32> %88, <1 x i32> zeroinitializer
%90 = tail call <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) %63, i32 4, <1 x i1> %73, <1 x i32> zeroinitializer)
%91 = mul nsw <1 x i32> %89, %83
%92 = add <1 x i32> %91, %90
tail call void @llvm.masked.store.v1i32.p3(<1 x i32> %92, ptr addrspace(3) %63, i32 4, <1 x i1> %73)
tail call void @llvm.amdgcn.s.waitcnt(i32 -7937)
tail call void @llvm.amdgcn.s.barrier()
%93 = add nuw nsw i64 %67, 64
%94 = icmp samesign ult i64 %93, %25
br i1 %94, label %66, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %82
%.pre = load <1 x i32>, ptr addrspace(3) %63, align 4
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %15
%95 = phi <1 x i32> [ %.pre, %._crit_edge.loopexit ], [ zeroinitializer, %15 ]
%96 = extractelement <1 x i32> %95, i64 0
%97 = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%98 = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %97)
%99 = and i32 %98, -64
%100 = add i32 %99, 64
%101 = xor i32 %98, 1
%102 = icmp slt i32 %101, %100
%103 = select i1 %102, i32 %101, i32 %98
%104 = shl i32 %103, 2
%105 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %104, i32 %96)
%106 = add i32 %105, %96
%107 = xor i32 %98, 2
%108 = icmp slt i32 %107, %100
%109 = select i1 %108, i32 %107, i32 %98
%110 = shl i32 %109, 2
%111 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %110, i32 %106)
%112 = add i32 %106, %111
%113 = xor i32 %98, 4
%114 = icmp slt i32 %113, %100
%115 = select i1 %114, i32 %113, i32 %98
%116 = shl i32 %115, 2
%117 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %116, i32 %112)
%118 = add i32 %112, %117
%119 = xor i32 %98, 8
%120 = icmp slt i32 %119, %100
%121 = select i1 %120, i32 %119, i32 %98
%122 = shl i32 %121, 2
%123 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %122, i32 %118)
%124 = add i32 %118, %123
%125 = xor i32 %98, 16
%126 = icmp slt i32 %125, %100
%127 = select i1 %126, i32 %125, i32 %98
%128 = shl i32 %127, 2
%129 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %128, i32 %124)
%130 = add i32 %124, %129
%131 = xor i32 %98, 32
%132 = icmp slt i32 %131, %100
%133 = select i1 %132, i32 %131, i32 %98
%134 = shl i32 %133, 2
%135 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %134, i32 %130)
%136 = icmp eq i32 %16, 0
br i1 %136, label %137, label %146
137: ; preds = %._crit_edge
%138 = add i32 %130, %135
%139 = mul i32 %13, %62
%140 = getelementptr i32, ptr addrspace(7) %60, i32 %139
%141 = getelementptr i32, ptr addrspace(7) %140, i32 %61
%142 = load <1 x i32>, ptr addrspace(7) %141, align 4
%143 = extractelement <1 x i32> %142, i64 0
%144 = add i32 %138, %143
%145 = insertelement <1 x i32> poison, i32 %144, i64 0
store <1 x i32> %145, ptr addrspace(7) %141, align 4
br label %146
146: ; preds = %137, %._crit_edge
ret void
}
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #2
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.umax.i64(i64, i64) #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) readnone, i16, i32, i32) #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workgroup.id.y() #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
; Function Attrs: alwaysinline convergent mustprogress nocallback nofree nounwind willreturn memory(none)
declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #4
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.smin.i64(i64, i64) #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) captures(none), i32 immarg, <1 x i1>, <1 x i32>) #5
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v1i32.p3(<1 x i32>, ptr addrspace(3) captures(none), i32 immarg, <1 x i1>) #6
; Function Attrs: alwaysinline mustprogress nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.s.waitcnt(i32 immarg) #7
; Function Attrs: alwaysinline convergent mustprogress nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.s.barrier() #8
attributes #0 = { alwaysinline nofree norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
attributes #1 = { alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #3 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
attributes #4 = { alwaysinline convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
attributes #5 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
attributes #6 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }
attributes #7 = { alwaysinline mustprogress nocallback nofree nounwind willreturn }
attributes #8 = { alwaysinline convergent mustprogress nocallback nofree nounwind willreturn }
!llvm.module.flags = !{!0, !1}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 1, !"amdhsa_code_object_version", i32 500}
!2 = !{i32 64, i32 1, i32 1}