@lialan
Created May 6, 2025 21:19
PR #137930 Triage
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.text
.globl matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.p2align 8
.type matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32,@function
matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32:
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_load_dwordx8 s[4:11], s[0:1], 0x8
s_load_dwordx4 s[12:15], s[0:1], 0x28
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
.p2align 8
.LBB0_0:
s_mov_b64 s[28:29], s[2:3]
s_mul_i32 s2, s10, s9
s_mul_hi_u32 s3, s10, s8
s_add_i32 s2, s3, s2
s_mul_i32 s3, s11, s8
s_add_i32 s3, s2, s3
s_mul_i32 s2, s10, s8
v_mov_b64_e32 v[6:7], s[10:11]
v_cmp_gt_u64_e32 vcc, s[2:3], v[6:7]
s_mov_b64 s[20:21], s[6:7]
s_and_b64 s[6:7], vcc, exec
s_load_dwordx4 s[24:27], s[0:1], 0x38
s_cselect_b32 s30, s2, s10
s_load_dwordx2 s[0:1], s[0:1], 0x38
s_mul_i32 s2, s14, s13
s_mul_hi_u32 s3, s14, s12
v_mov_b32_e32 v3, s15
s_add_i32 s2, s3, s2
s_mul_i32 s15, s15, s12
v_mov_b32_e32 v2, s14
s_add_i32 s3, s2, s15
s_mul_i32 s2, s14, s12
v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
s_and_b64 s[8:9], vcc, exec
s_cselect_b32 s6, s2, s14
s_waitcnt lgkmcnt(0)
s_mul_i32 s1, s26, s1
s_mul_hi_u32 s2, s26, s0
s_add_i32 s1, s2, s1
s_mul_i32 s2, s27, s0
v_mov_b32_e32 v4, s26
v_mov_b32_e32 v5, s27
s_add_i32 s1, s1, s2
s_mul_i32 s0, s26, s0
v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
s_and_b64 s[2:3], vcc, exec
s_cselect_b32 s0, s0, s26
s_lshl_b32 s22, s0, 2
s_and_b32 s21, s21, 0xffff
s_mov_b32 s7, 0x27000
v_lshl_add_u32 v1, v0, 2, 0
v_mov_b32_e32 v2, 0
s_cmp_eq_u64 s[10:11], 0
s_mov_b64 s[0:1], 0
ds_write_b32 v1, v2
s_cbranch_scc1 .LBB0_10
s_and_b32 s29, s29, 0xffff
s_and_b32 s5, s5, 0xffff
s_mul_i32 s2, s17, s10
s_lshl_b32 s12, s14, 6
v_add_u32_e32 v4, s2, v0
v_mul_lo_u32 v2, v0, s14
s_add_u32 s2, 0, s10
v_add_u32_e32 v5, s16, v2
s_addc_u32 s3, s11, 0
v_mov_b64_e32 v[2:3], s[10:11]
s_mov_b32 s31, s7
s_branch .LBB0_3
.LBB0_2:
s_or_b64 exec, exec, s[8:9]
s_add_u32 s0, s0, 64
s_addc_u32 s1, s1, 0
s_add_u32 s2, s2, 0xffffffc0
v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
v_add_u32_e32 v4, 64, v4
v_add_u32_e32 v5, s12, v5
s_addc_u32 s3, s3, -1
s_waitcnt lgkmcnt(0)
s_cbranch_vccz .LBB0_9
.LBB0_3:
v_cmp_lt_i64_e64 s[8:9], s[2:3], 64
s_and_b64 s[8:9], s[8:9], exec
s_cselect_b32 s8, s2, 64
s_waitcnt vmcnt(1)
v_sub_u32_e32 v7, s8, v0
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v6, 0
v_cmp_lt_i32_e32 vcc, 0, v7
v_mov_b32_e32 v7, 0
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execnz .LBB0_6
s_or_b64 exec, exec, s[8:9]
buffer_load_ubyte v8, v4, s[28:31], 0 offen
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execnz .LBB0_7
.LBB0_5:
s_or_b64 exec, exec, s[8:9]
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_2
s_branch .LBB0_8
.LBB0_6:
buffer_load_sbyte v7, v5, s[4:7], 0 offen
s_or_b64 exec, exec, s[8:9]
buffer_load_ubyte v8, v4, s[28:31], 0 offen
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_5
.LBB0_7:
ds_read_b32 v6, v1
s_or_b64 exec, exec, s[8:9]
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_2
.LBB0_8:
s_waitcnt vmcnt(0)
v_bfe_i32 v8, v8, 0, 8
v_cndmask_b32_e32 v8, 0, v8, vcc
s_waitcnt lgkmcnt(0)
v_mad_i32_i24 v6, v8, v7, v6
ds_write_b32 v1, v6
s_branch .LBB0_2
.LBB0_9:
ds_read_b32 v2, v1
.LBB0_10:
v_mbcnt_lo_u32_b32 v1, -1, 0
v_mbcnt_hi_u32_b32 v3, -1, v1
v_and_b32_e32 v1, 64, v3
v_add_u32_e32 v4, 64, v1
v_xor_b32_e32 v1, 1, v3
v_cmp_lt_i32_e32 vcc, v1, v4
v_xor_b32_e32 v5, 2, v3
s_nop 0
v_cndmask_b32_e32 v1, v3, v1, vcc
v_lshlrev_b32_e32 v1, 2, v1
s_waitcnt lgkmcnt(0)
ds_bpermute_b32 v1, v1, v2
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 4, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 8, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 16, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 32, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_cmp_eq_u32_e32 vcc, 0, v0
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_12
s_mul_i32 s0, s26, s17
s_lshl_b32 s0, s0, 2
s_lshl_b32 s1, s16, 2
s_add_i32 s0, s0, s1
s_mov_b32 s23, s7
v_mov_b32_e32 v0, s0
buffer_load_dword v3, v0, s[20:23], 0 offen
s_waitcnt vmcnt(0) lgkmcnt(0)
v_add3_u32 v1, v1, v2, v3
buffer_store_dword v1, v0, s[20:23], 0 offen
.LBB0_12:
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 72
.amdhsa_user_sgpr_count 16
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 14
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 9
.amdhsa_next_free_sgpr 32
.amdhsa_accum_offset 12
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32, .Lfunc_end0-matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_vgpr, 9
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_agpr, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.numbered_sgpr, 32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.private_seg_size, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_vcc, 1
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_flat_scratch, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_dyn_sized_stack, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_recursion, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .actual_access: read_only
.address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .actual_access: read_only
.address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
- .offset: 24
.size: 4
.value_kind: by_value
- .offset: 28
.size: 4
.value_kind: by_value
- .offset: 32
.size: 4
.value_kind: by_value
- .offset: 36
.size: 4
.value_kind: by_value
- .offset: 40
.size: 4
.value_kind: by_value
- .offset: 44
.size: 4
.value_kind: by_value
- .offset: 48
.size: 4
.value_kind: by_value
- .offset: 52
.size: 4
.value_kind: by_value
- .offset: 56
.size: 4
.value_kind: by_value
- .offset: 60
.size: 4
.value_kind: by_value
- .offset: 64
.size: 4
.value_kind: by_value
- .offset: 68
.size: 4
.value_kind: by_value
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 72
.max_flat_workgroup_size: 64
.name: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 64
- 1
- 1
.sgpr_count: 38
.sgpr_spill_count: 0
.symbol: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 9
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.text
.globl matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.p2align 8
.type matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32,@function
matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32:
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_load_dwordx8 s[4:11], s[0:1], 0x8
s_load_dwordx4 s[12:15], s[0:1], 0x28
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
.p2align 8
.LBB0_0:
s_mov_b64 s[28:29], s[2:3]
s_mul_i32 s2, s10, s9
s_mul_hi_u32 s3, s10, s8
s_add_i32 s2, s3, s2
s_mul_i32 s3, s11, s8
s_add_i32 s3, s2, s3
s_mul_i32 s2, s10, s8
v_mov_b64_e32 v[6:7], s[10:11]
v_cmp_gt_u64_e32 vcc, s[2:3], v[6:7]
s_mov_b64 s[20:21], s[6:7]
s_and_b64 s[6:7], vcc, exec
s_load_dwordx4 s[24:27], s[0:1], 0x38
s_cselect_b32 s30, s2, s10
s_load_dwordx2 s[0:1], s[0:1], 0x38
s_mul_i32 s2, s14, s13
s_mul_hi_u32 s3, s14, s12
v_mov_b32_e32 v3, s15
s_add_i32 s2, s3, s2
s_mul_i32 s15, s15, s12
v_mov_b32_e32 v2, s14
s_add_i32 s3, s2, s15
s_mul_i32 s2, s14, s12
v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
s_and_b64 s[8:9], vcc, exec
s_cselect_b32 s6, s2, s14
s_waitcnt lgkmcnt(0)
s_mul_i32 s1, s26, s1
s_mul_hi_u32 s2, s26, s0
s_add_i32 s1, s2, s1
s_mul_i32 s2, s27, s0
v_mov_b32_e32 v4, s26
v_mov_b32_e32 v5, s27
s_add_i32 s1, s1, s2
s_mul_i32 s0, s26, s0
v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
s_and_b64 s[2:3], vcc, exec
s_cselect_b32 s0, s0, s26
s_lshl_b32 s22, s0, 2
s_and_b32 s21, s21, 0xffff
s_mov_b32 s7, 0x27000
v_lshl_add_u32 v1, v0, 2, 0
v_mov_b32_e32 v2, 0
s_cmp_eq_u64 s[10:11], 0
s_mov_b64 s[0:1], 0
ds_write_b32 v1, v2
s_cbranch_scc1 .LBB0_10
s_and_b32 s29, s29, 0xffff
s_and_b32 s5, s5, 0xffff
s_mul_i32 s2, s17, s10
s_lshl_b32 s12, s14, 6
v_add_u32_e32 v4, s2, v0
v_mul_lo_u32 v2, v0, s14
s_add_u32 s2, 0, s10
v_add_u32_e32 v5, s16, v2
s_addc_u32 s3, s11, 0
v_mov_b64_e32 v[2:3], s[10:11]
s_mov_b32 s31, s7
v_mov_b32_e32 v6, 0
s_branch .LBB0_3
.LBB0_2:
s_or_b64 exec, exec, s[8:9]
s_add_u32 s0, s0, 64
s_addc_u32 s1, s1, 0
s_add_u32 s2, s2, 0xffffffc0
v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
v_add_u32_e32 v4, 64, v4
v_add_u32_e32 v5, s12, v5
s_addc_u32 s3, s3, -1
s_waitcnt lgkmcnt(0)
s_cbranch_vccz .LBB0_9
.LBB0_3:
v_cmp_lt_i64_e64 s[8:9], s[2:3], 64
s_and_b64 s[8:9], s[8:9], exec
s_cselect_b32 s8, s2, 64
s_waitcnt vmcnt(1)
v_sub_u32_e32 v7, s8, v0
v_cmp_lt_i32_e32 vcc, 0, v7
v_mov_b32_e32 v7, 0
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_5
buffer_load_sbyte v7, v5, s[4:7], 0 offen
.LBB0_5:
s_or_b64 exec, exec, s[8:9]
buffer_load_ubyte v8, v4, s[28:31], 0 offen
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v9, 0
s_and_saveexec_b64 s[8:9], vcc
ds_read_b32 v9, v1
s_or_b64 exec, exec, s[8:9]
s_and_saveexec_b64 s[8:9], vcc
s_cbranch_execz .LBB0_2
s_waitcnt vmcnt(0)
v_cndmask_b32_sdwa v8, v6, -v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
s_waitcnt lgkmcnt(0)
v_mad_i32_i24 v7, v8, v7, v9
ds_write_b32 v1, v7
s_branch .LBB0_2
.LBB0_9:
ds_read_b32 v2, v1
.LBB0_10:
v_mbcnt_lo_u32_b32 v1, -1, 0
v_mbcnt_hi_u32_b32 v3, -1, v1
v_and_b32_e32 v1, 64, v3
v_add_u32_e32 v4, 64, v1
v_xor_b32_e32 v1, 1, v3
v_cmp_lt_i32_e32 vcc, v1, v4
v_xor_b32_e32 v5, 2, v3
s_nop 0
v_cndmask_b32_e32 v1, v3, v1, vcc
v_lshlrev_b32_e32 v1, 2, v1
s_waitcnt lgkmcnt(0)
ds_bpermute_b32 v1, v1, v2
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 4, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 8, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 16, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_xor_b32_e32 v5, 32, v3
v_cmp_lt_i32_e32 vcc, v5, v4
s_waitcnt lgkmcnt(0)
v_add_u32_e32 v1, v1, v2
v_cndmask_b32_e32 v2, v3, v5, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v2, v2, v1
v_cmp_eq_u32_e32 vcc, 0, v0
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_12
s_mul_i32 s0, s26, s17
s_lshl_b32 s0, s0, 2
s_lshl_b32 s1, s16, 2
s_add_i32 s0, s0, s1
s_mov_b32 s23, s7
v_mov_b32_e32 v0, s0
buffer_load_dword v3, v0, s[20:23], 0 offen
s_waitcnt vmcnt(0) lgkmcnt(0)
v_add3_u32 v1, v1, v2, v3
buffer_store_dword v1, v0, s[20:23], 0 offen
.LBB0_12:
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 72
.amdhsa_user_sgpr_count 16
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 14
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 10
.amdhsa_next_free_sgpr 32
.amdhsa_accum_offset 12
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32, .Lfunc_end0-matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_vgpr, 10
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.num_agpr, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.numbered_sgpr, 32
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.private_seg_size, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_vcc, 1
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.uses_flat_scratch, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_dyn_sized_stack, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_recursion, 0
.set matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .actual_access: read_only
.address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .actual_access: read_only
.address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
- .offset: 24
.size: 4
.value_kind: by_value
- .offset: 28
.size: 4
.value_kind: by_value
- .offset: 32
.size: 4
.value_kind: by_value
- .offset: 36
.size: 4
.value_kind: by_value
- .offset: 40
.size: 4
.value_kind: by_value
- .offset: 44
.size: 4
.value_kind: by_value
- .offset: 48
.size: 4
.value_kind: by_value
- .offset: 52
.size: 4
.value_kind: by_value
- .offset: 56
.size: 4
.value_kind: by_value
- .offset: 60
.size: 4
.value_kind: by_value
- .offset: 64
.size: 4
.value_kind: by_value
- .offset: 68
.size: 4
.value_kind: by_value
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 72
.max_flat_workgroup_size: 64
.name: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 64
- 1
- 1
.sgpr_count: 38
.sgpr_spill_count: 0
.symbol: matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 10
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
; To reproduce the .optimized.ll from the .linked.ll, run:
; opt --passes='verify,annotation2metadata,forceattrs,inferattrs,coro-early,function<eager-inv>(ee-instrument<>,lower-expect,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sroa<modify-cfg>,early-cse<>),openmp-opt,amdgpu-printf-runtime-binding,amdgpu-unify-metadata,ipsccp,called-value-propagation,globalopt,function<eager-inv>(mem2reg,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>),always-inline,require<globals-aa>,function(invalidate<aa>),require<profile-summary>,cgscc(devirt<4>(inline,function-attrs<skip-non-recursive-function-attrs>,openmp-opt-cgscc,function(amdgpu-promote-kernel-arguments,infer-address-spaces,amdgpu-lower-kernel-attributes,amdgpu-promote-alloca-to-vector),function<eager-inv;no-rerun>(sroa<modify-cfg>,early-cse<memssa>,speculative-execution<only-if-divergent-target>,jump-threading,correlated-propagation,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,aggressive-instcombine,libcalls-shrinkwrap,amdgpu-usenative,amdgpu-simplifylib,tailcallelim,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,reassociate,constraint-elimination,loop-mssa(loop-instsimplify,loop-simplifycfg,licm<no-allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto>,licm<allowspeculation>,simple-loop-unswitch<no-nontrivial;trivial>),simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,loop(loop-idiom,indvars,extra-simple-loop-unswitch-passes,loop-deletion,loop-unroll-full),sroa<modify-cfg>,vector-combine,mldst-motion<no-split-footer-bb>,gvn<>,sccp,bdce,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,jump-threading,correlated-propagation,adce,memcpyopt,dse,move-auto-init,loop-mssa(licm<allowspeculation>),coro-elide,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib),function-attrs,function(require<should-not-run-function-passes>),coro-split,coro-annotation-elide)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,function<eager-inv>(float2int,lower-constant-intrinsics,loop(loop-rotate<header-duplication;no-prepare-for-lto>,loop-deletion),loop-distribute,inject-tli-mappings,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-load-elim,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,slp-vectorizer,vector-combine,instcombine<max-iterations=1;no-verify-fixpoint>,loop-unroll<O2>,transform-warning,sroa<preserve-cfg>,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,loop-sink,instsimplify,div-rem-pairs,tailcallelim,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>),amdgpu-attributor,globaldce,constmerge,cg-profile,rel-lookup-table-converter,function(annotation-remarks),verify'
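; A minimal reproduction sketch of that command. The .linked.ll/.optimized.ll filenames below are
; assumptions inferred from the module name and may differ in the actual build tree; the --passes
; string is the one-line pipeline given above, substituted for the <pipeline above> placeholder:
;   opt --passes='<pipeline above>' \
;     matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0.linked.ll \
;     -S -o matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0.optimized.ll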
; ModuleID = 'matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0'
source_filename = "matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
@__dynamic_shared_memory__ = external addrspace(3) global [0 x i8], align 16
; Function Attrs: alwaysinline nofree norecurse nounwind
define amdgpu_kernel void @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_dispatch_0_matmul_DxDxD_i8xi8xi32(ptr addrspace(1) inreg noalias noundef nonnull readonly align 16 %0, ptr addrspace(1) inreg noalias noundef nonnull readonly align 16 %1, ptr addrspace(1) inreg noalias noundef nonnull align 16 %2, i32 inreg noundef %3, i32 inreg noundef %4, i32 inreg noundef %5, i32 inreg noundef %6, i32 inreg noundef %7, i32 inreg noundef %8, i32 inreg noundef %9, i32 inreg noundef %10, i32 inreg noundef %11, i32 inreg noundef %12, i32 inreg noundef %13, i32 inreg noundef %14) local_unnamed_addr #0 !reqd_work_group_size !2 {
%16 = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
%17 = zext nneg i32 %16 to i64
%18 = zext i32 %3 to i64
%19 = zext i32 %4 to i64
%20 = shl nuw i64 %19, 32
%21 = or disjoint i64 %20, %18
%22 = zext i32 %5 to i64
%23 = zext i32 %6 to i64
%24 = shl nuw i64 %23, 32
%25 = or disjoint i64 %24, %22
%26 = zext i32 %7 to i64
%27 = zext i32 %8 to i64
%28 = shl nuw i64 %27, 32
%29 = or disjoint i64 %28, %26
%30 = zext i32 %9 to i64
%31 = zext i32 %10 to i64
%32 = shl nuw i64 %31, 32
%33 = or disjoint i64 %32, %30
%34 = zext i32 %11 to i64
%35 = zext i32 %12 to i64
%36 = shl nuw i64 %35, 32
%37 = or disjoint i64 %36, %34
%38 = zext i32 %13 to i64
%39 = zext nneg i32 %14 to i64
%40 = shl nuw nsw i64 %39, 32
%41 = or disjoint i64 %40, %38
%42 = icmp ult i32 %4, 2097152
tail call void @llvm.assume(i1 %42)
%43 = icmp ult i32 %6, 2097152
tail call void @llvm.assume(i1 %43)
%44 = icmp ult i32 %8, 2097152
tail call void @llvm.assume(i1 %44)
%45 = icmp ult i32 %10, 2097152
tail call void @llvm.assume(i1 %45)
%46 = icmp ult i32 %12, 2097152
tail call void @llvm.assume(i1 %46)
%47 = icmp ult i32 %14, 2097152
tail call void @llvm.assume(i1 %47)
%48 = mul i64 %25, %21
%49 = tail call i64 @llvm.umax.i64(i64 %48, i64 %25)
%50 = trunc i64 %49 to i32
%51 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %0, i16 0, i32 %50, i32 159744)
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %51, i64 64) ]
%52 = mul i64 %33, %29
%53 = tail call i64 @llvm.umax.i64(i64 %52, i64 %33)
%54 = trunc i64 %53 to i32
%55 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %1, i16 0, i32 %54, i32 159744)
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %55, i64 64) ]
%56 = mul i64 %41, %37
%57 = tail call i64 @llvm.umax.i64(i64 %56, i64 %41)
%58 = trunc i64 %57 to i32
%59 = shl i32 %58, 2
%60 = tail call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nonnull %2, i16 0, i32 %59, i32 159744)
call void @llvm.assume(i1 true) [ "align"(ptr addrspace(7) %60, i64 64) ]
%61 = tail call range(i32 0, 2147483647) i32 @llvm.amdgcn.workgroup.id.x()
%62 = tail call range(i32 0, 2147483647) i32 @llvm.amdgcn.workgroup.id.y()
%63 = getelementptr i32, ptr addrspace(3) @__dynamic_shared_memory__, i32 %16
store <1 x i32> zeroinitializer, ptr addrspace(3) %63, align 4
%invariant.gep = getelementptr i8, ptr addrspace(7) %55, i32 %61
%.not = icmp eq i64 %25, 0
br i1 %.not, label %._crit_edge, label %.lr.ph
.lr.ph: ; preds = %15
%64 = zext nneg i32 %62 to i64
%65 = mul nuw nsw i64 %22, %64
br label %66
66: ; preds = %.lr.ph, %82
%67 = phi i64 [ 0, %.lr.ph ], [ %93, %82 ]
%68 = sub nuw nsw i64 %25, %67
%69 = tail call i64 @llvm.smin.i64(i64 %68, i64 64)
%70 = trunc nuw nsw i64 %69 to i32
%71 = sub i32 %70, %16
%72 = insertelement <1 x i32> poison, i32 %71, i64 0
%73 = icmp sgt <1 x i32> %72, zeroinitializer
%74 = or disjoint i64 %67, %17
%75 = extractelement <1 x i1> %73, i64 0
br i1 %75, label %76, label %82
76: ; preds = %66
%77 = trunc i64 %74 to i32
%78 = mul i32 %9, %77
%gep = getelementptr i8, ptr addrspace(7) %invariant.gep, i32 %78
%79 = load i8, ptr addrspace(7) %gep, align 1
%80 = insertelement <1 x i8> poison, i8 %79, i64 0
%81 = sext <1 x i8> %80 to <1 x i32>
br label %82
82: ; preds = %66, %76
%83 = phi <1 x i32> [ %81, %76 ], [ zeroinitializer, %66 ]
%84 = add nuw i64 %74, %65
%85 = trunc i64 %84 to i32
%86 = getelementptr i8, ptr addrspace(7) %51, i32 %85
%87 = load <1 x i8>, ptr addrspace(7) %86, align 1
%88 = sext <1 x i8> %87 to <1 x i32>
%89 = select <1 x i1> %73, <1 x i32> %88, <1 x i32> zeroinitializer
%90 = tail call <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) %63, i32 4, <1 x i1> %73, <1 x i32> zeroinitializer)
%91 = mul nsw <1 x i32> %89, %83
%92 = add <1 x i32> %91, %90
tail call void @llvm.masked.store.v1i32.p3(<1 x i32> %92, ptr addrspace(3) %63, i32 4, <1 x i1> %73)
tail call void @llvm.amdgcn.s.waitcnt(i32 -7937)
tail call void @llvm.amdgcn.s.barrier()
%93 = add nuw nsw i64 %67, 64
%94 = icmp samesign ult i64 %93, %25
br i1 %94, label %66, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %82
%.pre = load <1 x i32>, ptr addrspace(3) %63, align 4
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %15
%95 = phi <1 x i32> [ %.pre, %._crit_edge.loopexit ], [ zeroinitializer, %15 ]
%96 = extractelement <1 x i32> %95, i64 0
%97 = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%98 = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %97)
%99 = and i32 %98, -64
%100 = add i32 %99, 64
%101 = xor i32 %98, 1
%102 = icmp slt i32 %101, %100
%103 = select i1 %102, i32 %101, i32 %98
%104 = shl i32 %103, 2
%105 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %104, i32 %96)
%106 = add i32 %105, %96
%107 = xor i32 %98, 2
%108 = icmp slt i32 %107, %100
%109 = select i1 %108, i32 %107, i32 %98
%110 = shl i32 %109, 2
%111 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %110, i32 %106)
%112 = add i32 %106, %111
%113 = xor i32 %98, 4
%114 = icmp slt i32 %113, %100
%115 = select i1 %114, i32 %113, i32 %98
%116 = shl i32 %115, 2
%117 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %116, i32 %112)
%118 = add i32 %112, %117
%119 = xor i32 %98, 8
%120 = icmp slt i32 %119, %100
%121 = select i1 %120, i32 %119, i32 %98
%122 = shl i32 %121, 2
%123 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %122, i32 %118)
%124 = add i32 %118, %123
%125 = xor i32 %98, 16
%126 = icmp slt i32 %125, %100
%127 = select i1 %126, i32 %125, i32 %98
%128 = shl i32 %127, 2
%129 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %128, i32 %124)
%130 = add i32 %124, %129
%131 = xor i32 %98, 32
%132 = icmp slt i32 %131, %100
%133 = select i1 %132, i32 %131, i32 %98
%134 = shl i32 %133, 2
%135 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %134, i32 %130)
%136 = icmp eq i32 %16, 0
br i1 %136, label %137, label %146
137: ; preds = %._crit_edge
%138 = add i32 %130, %135
%139 = mul i32 %13, %62
%140 = getelementptr i32, ptr addrspace(7) %60, i32 %139
%141 = getelementptr i32, ptr addrspace(7) %140, i32 %61
%142 = load <1 x i32>, ptr addrspace(7) %141, align 4
%143 = extractelement <1 x i32> %142, i64 0
%144 = add i32 %138, %143
%145 = insertelement <1 x i32> poison, i32 %144, i64 0
store <1 x i32> %145, ptr addrspace(7) %141, align 4
br label %146
146: ; preds = %137, %._crit_edge
ret void
}
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #2
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.umax.i64(i64, i64) #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) readnone, i16, i32, i32) #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workgroup.id.y() #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
; Function Attrs: alwaysinline convergent mustprogress nocallback nofree nounwind willreturn memory(none)
declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #4
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.smin.i64(i64, i64) #1
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) captures(none), i32 immarg, <1 x i1>, <1 x i32>) #5
; Function Attrs: alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.v1i32.p3(<1 x i32>, ptr addrspace(3) captures(none), i32 immarg, <1 x i1>) #6
; Function Attrs: alwaysinline mustprogress nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.s.waitcnt(i32 immarg) #7
; Function Attrs: alwaysinline convergent mustprogress nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.s.barrier() #8
attributes #0 = { alwaysinline nofree norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
attributes #1 = { alwaysinline mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #3 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
attributes #4 = { alwaysinline convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
attributes #5 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
attributes #6 = { alwaysinline mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }
attributes #7 = { alwaysinline mustprogress nocallback nofree nounwind willreturn }
attributes #8 = { alwaysinline convergent mustprogress nocallback nofree nounwind willreturn }
!llvm.module.flags = !{!0, !1}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 1, !"amdhsa_code_object_version", i32 500}
!2 = !{i32 64, i32 1, i32 1}