diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index efb2894aaf642..70e80f9fe14ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -47,10 +47,10 @@ static cl::opt WidenLoads( cl::init(false)); static cl::opt Widen16BitOps( - "amdgpu-codegenprepare-widen-16-bit-ops", - cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, - cl::init(true)); + "amdgpu-codegenprepare-widen-16-bit-ops", + cl::desc( + "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, cl::init(false)); static cl::opt BreakLargePHIs("amdgpu-codegenprepare-break-large-phis", diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index c8b82716a9fe1..814acc3be1fc0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -281,12 +281,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat: @@ -323,12 +323,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo: @@ -365,12 +365,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, 4 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi: @@ -408,14 +408,13 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16: @@ -461,14 +460,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs: @@ -517,14 +515,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_rhs: @@ -580,14 +577,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 82d87358e1faf..aea32b3fedba7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -70,30 +70,15 @@ define i8 @v_ashr_i8_7(i8 %value) { } define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) { -; GFX6-LABEL: s_ashr_i8: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i8 s0, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_ashr_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_sext_i32_i8 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_ashr_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_sext_i32_i8 s1, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_ashr_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_sext_i32_i8 s0, s0 +; GCN-NEXT: s_ashr_i32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i8: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_sext_i32_i8 s1, s1 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i8 %value, %amount @@ -642,30 +627,15 @@ define i16 @v_ashr_i16_15(i16 %value) { } define amdgpu_ps i16 @s_ashr_i16(i16 inreg %value, i16 inreg %amount) { -; GFX6-LABEL: s_ashr_i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_ashr_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_ashr_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_ashr_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NEXT: s_ashr_i32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_sext_i32_i16 s1, s1 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount @@ -826,14 +796,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_ashr_i32 s2, s2, s3 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s1, s2 +; GFX8-NEXT: s_ashr_i32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1028,23 +999,25 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s6, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s7, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 -; GFX8-NEXT: s_ashr_i32 s4, s4, s6 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_ashr_i32 s2, s5, s7 +; GFX8-NEXT: s_sext_i32_i16 s2, s4 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_ashr_i32 s2, s2, s6 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s3, s4, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s5 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1235,41 +1208,45 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s9, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s12, s4 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s13, s5 -; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s10, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s14, s6 -; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_ashr_i32 s4, s9, s13 +; GFX8-NEXT: s_sext_i32_i16 s4, s8 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_ashr_i32 s4, s4, s12 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s11, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s15, s7 -; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010 -; GFX8-NEXT: s_ashr_i32 s5, s10, s14 +; GFX8-NEXT: s_sext_i32_i16 s5, s9 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_ashr_i32 s5, s5, s13 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_ashr_i32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: s_ashr_i32 s8, s8, s12 -; GFX8-NEXT: s_ashr_i32 s6, s11, s15 +; GFX8-NEXT: s_sext_i32_i16 s6, s10 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_ashr_i32 s6, s6, s14 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_sext_i32_i16 s7, s11 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s4, s5, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s7, s8, 0xffff +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_ashr_i32 s7, s7, s15 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s4, s6, 0xffff -; GFX8-NEXT: s_or_b32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 7fa0d23e55938..be1dc7f0c67f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -85,14 +85,27 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) { } define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) { -; GCN-LABEL: s_lshr_i8_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0x10007 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i8_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10007 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i8_7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshr_b32 s0, s0, 7 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i8_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshr_b32 s0, s0, 7 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i8_7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x10007 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 ret i8 %result @@ -619,15 +632,27 @@ define i16 @v_lshr_i16_15(i16 %value) { } define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) { -; GCN-LABEL: s_lshr_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount @@ -635,14 +660,27 @@ define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) { } define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) { -; GCN-LABEL: s_lshr_i16_15: -; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0x1000f -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i16_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1000f +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i16_15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 15 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i16_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 15 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1000f +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 15 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, 15 ret i16 %result @@ -783,13 +821,12 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: s_lshr_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v2i16: @@ -970,21 +1007,19 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s4, s6 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v4i16: @@ -1155,37 +1190,33 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s4, s8, s12 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s5, s9, s13 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s10, s14 -; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s3, s7 ; GFX8-NEXT: s_lshr_b32 s7, s11, s15 -; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 455446aa38c60..f9cb584d27ecc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -8,37 +8,18 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { -; GFX7-LABEL: s_mul_i16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mul_i32 s0, s0, s1 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_mul_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_mul_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_mul_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den @@ -106,35 +87,27 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_zeroext: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16_zeroext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result @@ -197,42 +170,22 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { } define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) { -; GFX7-LABEL: s_mul_i16_signext: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mul_i32 s0, s0, s1 -; GFX7-NEXT: s_sext_i32_i16 s0, s0 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_mul_i16_signext: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_mul_i16_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_mul_i16_signext: +; GCN: ; %bb.0: +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_signext: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16_signext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_sext_i32_i16 s0, s0 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index 131970148ed05..46b75eb55cb52 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -40,30 +40,14 @@ define i8 @v_sext_inreg_i8_7(i8 %value) { } define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i8: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x50000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 3 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 3 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x50000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 3 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x50000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i8 %value, 3 %ashr = ashr i8 %shl, 3 @@ -71,30 +55,14 @@ define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) { } define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i8_6: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i8_6: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 6 -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 6 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i8_6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 6 -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 6 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i8_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x20000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8_6: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 6 -; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 6 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x20000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i8 %value, 6 %ashr = ashr i8 %shl, 6 @@ -545,30 +513,14 @@ define i16 @v_sext_inreg_i16_15(i16 %value) { } define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i16_9: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x70000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i16_9: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 9 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 9 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i16_9: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 9 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i16_9: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x70000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_9: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x70000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i16 %value, 9 %ashr = ashr i16 %shl, 9 @@ -576,30 +528,14 @@ define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) { } define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i16_15: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i16_15: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 15 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 15 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i16_15: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 15 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 15 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i16_15: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x10000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15 -; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x10000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i16 %value, 15 %ashr = ashr i16 %shl, 15 @@ -690,15 +626,11 @@ define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) { ; ; GFX8-LABEL: s_sext_inreg_v2i16_11: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 11 -; GFX8-NEXT: s_lshl_b32 s1, s1, 11 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, 11 -; GFX8-NEXT: s_ashr_i32 s1, s1, 11 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_bfe_i32 s1, s0, 0x50000 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x50010 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -823,25 +755,17 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) { ; ; GFX8-LABEL: s_sext_inreg_v4i16_14: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 14 -; GFX8-NEXT: s_lshl_b32 s2, s2, 14 -; GFX8-NEXT: s_lshl_b32 s1, s1, 14 -; GFX8-NEXT: s_lshl_b32 s3, s3, 14 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_ashr_i32 s0, s0, 14 -; GFX8-NEXT: s_ashr_i32 s2, s2, 14 -; GFX8-NEXT: s_ashr_i32 s1, s1, 14 -; GFX8-NEXT: s_ashr_i32 s3, s3, 14 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_bfe_i32 s2, s0, 0x20000 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x20010 +; GFX8-NEXT: s_bfe_i32 s3, s1, 0x20000 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x20010 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 -; GFX8-NEXT: s_lshl_b32 s2, s3, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s2, s3, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1036,45 +960,29 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) { ; ; GFX8-LABEL: s_sext_inreg_v8i16_5: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; GFX8-NEXT: s_lshl_b32 s4, s4, 5 -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 5 -; GFX8-NEXT: s_lshl_b32 s5, s5, 5 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_lshl_b32 s2, s2, 5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 5 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_ashr_i32 s0, s0, 5 -; GFX8-NEXT: s_ashr_i32 s4, s4, 5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 5 -; GFX8-NEXT: s_lshl_b32 s7, s7, 5 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_ashr_i32 s1, s1, 5 -; GFX8-NEXT: s_ashr_i32 s5, s5, 5 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0xb0010 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0xb0010 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_ashr_i32 s2, s2, 5 -; GFX8-NEXT: s_ashr_i32 s6, s6, 5 -; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: s_lshl_b32 s4, s5, 16 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0xb0010 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_ashr_i32 s3, s3, 5 -; GFX8-NEXT: s_ashr_i32 s7, s7, 5 -; GFX8-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NEXT: s_lshl_b32 s4, s6, 16 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0xb0010 +; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_and_b32 s4, s5, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_lshl_b32 s4, s7, 16 +; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_and_b32 s4, s7, 0xffff +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 993d0f76ea10e..0806eecbcc1dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -617,13 +617,12 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { ; ; GFX8-LABEL: s_shl_v2i32_zext_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s2, 0x3fff -; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v2i32_zext_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 77917377f1cd6..139652eb55e3d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -64,26 +64,13 @@ define i8 @v_shl_i8_7(i8 %value) { } define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) { -; GFX6-LABEL: s_shl_i8: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_shl_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_shl_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_shl_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, %amount @@ -620,26 +607,13 @@ define i16 @v_shl_i16_15(i16 %value) { } define amdgpu_ps i16 @s_shl_i16(i16 inreg %value, i16 inreg %amount) { -; GFX6-LABEL: s_shl_i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_shl_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_shl_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_shl_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount @@ -791,13 +765,13 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v2i16: @@ -976,21 +950,21 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s4, s6 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s7 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 -; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v4i16: @@ -1157,37 +1131,37 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s4, s8, s12 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshl_b32 s5, s9, s13 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s6, s10, s14 -; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, s7 ; GFX8-NEXT: s_lshl_b32 s7, s11, s15 -; GFX8-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll index 6c104709f5ee3..9aa393ee137d6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll @@ -244,12 +244,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 -; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, 64 +; GFX8-NEXT: s_add_i32 s0, s0, 64 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat: @@ -284,12 +284,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 -; GFX8-NEXT: s_add_i32 s1, s1, -4 +; GFX8-NEXT: s_add_i32 s1, s1, 0xfffc +; GFX8-NEXT: s_add_i32 s0, s0, 64 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo: @@ -324,12 +324,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, -4 -; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, 64 +; GFX8-NEXT: s_add_i32 s0, s0, 0xfffc +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi: @@ -365,14 +365,13 @@ define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX8-LABEL: s_sub_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16: @@ -412,14 +411,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_lhs: @@ -463,14 +461,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_rhs: @@ -516,14 +513,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 17b6f5072116d..7d7452485fdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -35,15 +35,8 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; ; GFX8-LABEL: scalar_xnor_v2i16_one_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_xor_b32 s0, s0, s1 -; GFX8-NEXT: s_mov_b32 s3, s2 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_xor_b32 s0, s0, -1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: scalar_xnor_v2i16_one_use: @@ -127,21 +120,8 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX8-LABEL: scalar_xnor_v4i16_one_use: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_and_b32 s2, s0, 0xffff -; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_and_b32 s6, s1, 0xffff -; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], -1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: scalar_xnor_v4i16_one_use: diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 0deddfb8d7310..50d20e9b0e4d7 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -102,13 +102,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_add_i32 s0, s2, s3 ; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: s_add_i32 s0, s0, s1 -; VI-NEXT: s_and_b32 s1, s2, 0xffff -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_add_i32 s2, s2, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_lshl_b32 s1, s2, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -167,16 +167,15 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_add_i32 s0, s2, s2 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -225,12 +224,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 ; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: s_add_i32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s5, s4 ; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s4, 16 +; VI-NEXT: s_lshl_b32 s3, s5, 16 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 9d185ec8113aa..fa73ef0b0ec4c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -19673,255 +19673,277 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -19930,223 +19952,208 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v52, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v40, v21 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v34, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v58, v2 -; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -20177,535 +20184,441 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: v_mov_b32_e32 v44, v24 -; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 ; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: v_mov_b32_e32 v52, v48 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 -; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_or_b32_e32 v30, v30, v31 -; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v32 -; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 -; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_e32 v26, v26, v33 -; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v28, v28, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v29, v29, v33 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB15_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -38367,201 +38280,190 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v32, s30, 0 -; VI-NEXT: v_writelane_b32 v32, s31, 1 -; VI-NEXT: v_writelane_b32 v32, s34, 2 -; VI-NEXT: v_writelane_b32 v32, s35, 3 -; VI-NEXT: v_writelane_b32 v32, s36, 4 -; VI-NEXT: v_writelane_b32 v32, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v32, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s47, v2 -; VI-NEXT: v_readfirstlane_b32 s46, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s44, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s42, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s40, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s14, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s12, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s10, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s8, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s6, v1 -; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: v_readfirstlane_b32 s47, v1 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_add_i32 s5, s47, 3 -; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 -; VI-NEXT: s_add_i32 s46, s46, 3 -; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 -; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 -; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s12, s12, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s14, s14, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s40, s40, 0xffff -; VI-NEXT: s_and_b32 s41, s41, 0xffff -; VI-NEXT: s_and_b32 s42, s42, 0xffff -; VI-NEXT: s_and_b32 s43, s43, 0xffff -; VI-NEXT: s_and_b32 s44, s44, 0xffff -; VI-NEXT: s_and_b32 s45, s45, 0xffff -; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s39, s6 -; VI-NEXT: s_or_b32 s7, s38, s7 -; VI-NEXT: s_or_b32 s29, s37, s29 -; VI-NEXT: s_or_b32 s28, s36, s28 -; VI-NEXT: s_or_b32 s27, s35, s27 -; VI-NEXT: s_or_b32 s26, s34, s26 -; VI-NEXT: s_or_b32 s25, s31, s25 -; VI-NEXT: s_or_b32 s24, s30, s24 -; VI-NEXT: s_or_b32 s23, vcc_hi, s23 -; VI-NEXT: s_or_b32 s22, vcc_lo, s22 -; VI-NEXT: s_or_b32 s21, s91, s21 -; VI-NEXT: s_or_b32 s20, s90, s20 -; VI-NEXT: s_or_b32 s19, s89, s19 -; VI-NEXT: s_or_b32 s18, s88, s18 -; VI-NEXT: s_or_b32 s17, s79, s17 -; VI-NEXT: s_or_b32 s16, s78, s16 -; VI-NEXT: s_or_b32 s8, s77, s8 -; VI-NEXT: s_or_b32 s9, s76, s9 -; VI-NEXT: s_or_b32 s10, s75, s10 -; VI-NEXT: s_or_b32 s11, s74, s11 -; VI-NEXT: s_or_b32 s12, s73, s12 -; VI-NEXT: s_or_b32 s13, s72, s13 -; VI-NEXT: s_or_b32 s14, s63, s14 -; VI-NEXT: s_or_b32 s15, s62, s15 -; VI-NEXT: s_or_b32 s40, s61, s40 -; VI-NEXT: s_or_b32 s41, s60, s41 -; VI-NEXT: s_or_b32 s42, s59, s42 -; VI-NEXT: s_or_b32 s43, s58, s43 -; VI-NEXT: s_or_b32 s44, s57, s44 -; VI-NEXT: s_or_b32 s45, s56, s45 -; VI-NEXT: s_or_b32 s46, s47, s46 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 -; VI-NEXT: s_add_i32 s40, s40, 0x30000 -; VI-NEXT: s_add_i32 s41, s41, 0x30000 -; VI-NEXT: s_add_i32 s42, s42, 0x30000 -; VI-NEXT: s_add_i32 s43, s43, 0x30000 -; VI-NEXT: s_add_i32 s44, s44, 0x30000 -; VI-NEXT: s_add_i32 s45, s45, 0x30000 -; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s5, s46, 3 ; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -38577,36 +38479,24 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s7 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s47 -; VI-NEXT: v_mov_b32_e32 v17, s46 -; VI-NEXT: v_mov_b32_e32 v18, s45 -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: v_mov_b32_e32 v20, s43 -; VI-NEXT: v_mov_b32_e32 v21, s42 -; VI-NEXT: v_mov_b32_e32 v22, s41 -; VI-NEXT: v_mov_b32_e32 v23, s40 -; VI-NEXT: v_mov_b32_e32 v24, s15 -; VI-NEXT: v_mov_b32_e32 v25, s14 -; VI-NEXT: v_mov_b32_e32 v26, s13 -; VI-NEXT: v_mov_b32_e32 v27, s12 -; VI-NEXT: v_mov_b32_e32 v28, s11 -; VI-NEXT: v_mov_b32_e32 v29, s10 -; VI-NEXT: v_mov_b32_e32 v30, s9 -; VI-NEXT: v_mov_b32_e32 v31, s8 -; VI-NEXT: v_readlane_b32 s39, v32, 7 -; VI-NEXT: v_readlane_b32 s38, v32, 6 -; VI-NEXT: v_readlane_b32 s37, v32, 5 -; VI-NEXT: v_readlane_b32 s36, v32, 4 -; VI-NEXT: v_readlane_b32 s35, v32, 3 -; VI-NEXT: v_readlane_b32 s34, v32, 2 -; VI-NEXT: v_readlane_b32 s31, v32, 1 -; VI-NEXT: v_readlane_b32 s30, v32, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: ; VI-NEXT: s_branch .LBB27_2 @@ -58500,255 +58390,277 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB39_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -58757,223 +58669,208 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v52, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v40, v21 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v34, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v58, v2 -; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -59004,535 +58901,441 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: v_mov_b32_e32 v44, v24 -; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 ; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: v_mov_b32_e32 v52, v48 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB39_3: ; %Flow -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB39_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 -; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_or_b32_e32 v30, v30, v31 -; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v32 -; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 -; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_e32 v26, v26, v33 -; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v28, v28, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v29, v29, v33 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB39_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -77193,201 +76996,190 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v32, s30, 0 -; VI-NEXT: v_writelane_b32 v32, s31, 1 -; VI-NEXT: v_writelane_b32 v32, s34, 2 -; VI-NEXT: v_writelane_b32 v32, s35, 3 -; VI-NEXT: v_writelane_b32 v32, s36, 4 -; VI-NEXT: v_writelane_b32 v32, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v32, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s47, v2 -; VI-NEXT: v_readfirstlane_b32 s46, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s44, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s42, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s40, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s14, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s12, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s10, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s8, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s6, v1 -; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: v_readfirstlane_b32 s47, v1 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_add_i32 s5, s47, 3 -; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 -; VI-NEXT: s_add_i32 s46, s46, 3 -; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 -; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 -; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s12, s12, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s14, s14, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s40, s40, 0xffff -; VI-NEXT: s_and_b32 s41, s41, 0xffff -; VI-NEXT: s_and_b32 s42, s42, 0xffff -; VI-NEXT: s_and_b32 s43, s43, 0xffff -; VI-NEXT: s_and_b32 s44, s44, 0xffff -; VI-NEXT: s_and_b32 s45, s45, 0xffff -; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s39, s6 -; VI-NEXT: s_or_b32 s7, s38, s7 -; VI-NEXT: s_or_b32 s29, s37, s29 -; VI-NEXT: s_or_b32 s28, s36, s28 -; VI-NEXT: s_or_b32 s27, s35, s27 -; VI-NEXT: s_or_b32 s26, s34, s26 -; VI-NEXT: s_or_b32 s25, s31, s25 -; VI-NEXT: s_or_b32 s24, s30, s24 -; VI-NEXT: s_or_b32 s23, vcc_hi, s23 -; VI-NEXT: s_or_b32 s22, vcc_lo, s22 -; VI-NEXT: s_or_b32 s21, s91, s21 -; VI-NEXT: s_or_b32 s20, s90, s20 -; VI-NEXT: s_or_b32 s19, s89, s19 -; VI-NEXT: s_or_b32 s18, s88, s18 -; VI-NEXT: s_or_b32 s17, s79, s17 -; VI-NEXT: s_or_b32 s16, s78, s16 -; VI-NEXT: s_or_b32 s8, s77, s8 -; VI-NEXT: s_or_b32 s9, s76, s9 -; VI-NEXT: s_or_b32 s10, s75, s10 -; VI-NEXT: s_or_b32 s11, s74, s11 -; VI-NEXT: s_or_b32 s12, s73, s12 -; VI-NEXT: s_or_b32 s13, s72, s13 -; VI-NEXT: s_or_b32 s14, s63, s14 -; VI-NEXT: s_or_b32 s15, s62, s15 -; VI-NEXT: s_or_b32 s40, s61, s40 -; VI-NEXT: s_or_b32 s41, s60, s41 -; VI-NEXT: s_or_b32 s42, s59, s42 -; VI-NEXT: s_or_b32 s43, s58, s43 -; VI-NEXT: s_or_b32 s44, s57, s44 -; VI-NEXT: s_or_b32 s45, s56, s45 -; VI-NEXT: s_or_b32 s46, s47, s46 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 -; VI-NEXT: s_add_i32 s40, s40, 0x30000 -; VI-NEXT: s_add_i32 s41, s41, 0x30000 -; VI-NEXT: s_add_i32 s42, s42, 0x30000 -; VI-NEXT: s_add_i32 s43, s43, 0x30000 -; VI-NEXT: s_add_i32 s44, s44, 0x30000 -; VI-NEXT: s_add_i32 s45, s45, 0x30000 -; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s5, s46, 3 ; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -77403,36 +77195,24 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s7 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s47 -; VI-NEXT: v_mov_b32_e32 v17, s46 -; VI-NEXT: v_mov_b32_e32 v18, s45 -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: v_mov_b32_e32 v20, s43 -; VI-NEXT: v_mov_b32_e32 v21, s42 -; VI-NEXT: v_mov_b32_e32 v22, s41 -; VI-NEXT: v_mov_b32_e32 v23, s40 -; VI-NEXT: v_mov_b32_e32 v24, s15 -; VI-NEXT: v_mov_b32_e32 v25, s14 -; VI-NEXT: v_mov_b32_e32 v26, s13 -; VI-NEXT: v_mov_b32_e32 v27, s12 -; VI-NEXT: v_mov_b32_e32 v28, s11 -; VI-NEXT: v_mov_b32_e32 v29, s10 -; VI-NEXT: v_mov_b32_e32 v30, s9 -; VI-NEXT: v_mov_b32_e32 v31, s8 -; VI-NEXT: v_readlane_b32 s39, v32, 7 -; VI-NEXT: v_readlane_b32 s38, v32, 6 -; VI-NEXT: v_readlane_b32 s37, v32, 5 -; VI-NEXT: v_readlane_b32 s36, v32, 4 -; VI-NEXT: v_readlane_b32 s35, v32, 3 -; VI-NEXT: v_readlane_b32 s34, v32, 2 -; VI-NEXT: v_readlane_b32 s31, v32, 1 -; VI-NEXT: v_readlane_b32 s30, v32, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: ; VI-NEXT: s_branch .LBB51_2 @@ -95346,255 +95126,277 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB59_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -95603,223 +95405,208 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v52, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v40, v21 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v34, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v58, v2 -; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -95850,535 +95637,441 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: v_mov_b32_e32 v44, v24 -; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 ; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: v_mov_b32_e32 v52, v48 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB59_3: ; %Flow -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB59_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 -; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_or_b32_e32 v30, v30, v31 -; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v32 -; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 -; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_e32 v26, v26, v33 -; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v28, v28, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v29, v29, v33 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB59_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -114055,201 +113748,190 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v32, s30, 0 -; VI-NEXT: v_writelane_b32 v32, s31, 1 -; VI-NEXT: v_writelane_b32 v32, s34, 2 -; VI-NEXT: v_writelane_b32 v32, s35, 3 -; VI-NEXT: v_writelane_b32 v32, s36, 4 -; VI-NEXT: v_writelane_b32 v32, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v32, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s47, v2 -; VI-NEXT: v_readfirstlane_b32 s46, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s44, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s42, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s40, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s14, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s12, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s10, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s8, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s6, v1 -; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: v_readfirstlane_b32 s47, v1 ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_add_i32 s5, s47, 3 -; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 -; VI-NEXT: s_add_i32 s46, s46, 3 -; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 -; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 -; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s12, s12, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s14, s14, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s40, s40, 0xffff -; VI-NEXT: s_and_b32 s41, s41, 0xffff -; VI-NEXT: s_and_b32 s42, s42, 0xffff -; VI-NEXT: s_and_b32 s43, s43, 0xffff -; VI-NEXT: s_and_b32 s44, s44, 0xffff -; VI-NEXT: s_and_b32 s45, s45, 0xffff -; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s39, s6 -; VI-NEXT: s_or_b32 s7, s38, s7 -; VI-NEXT: s_or_b32 s29, s37, s29 -; VI-NEXT: s_or_b32 s28, s36, s28 -; VI-NEXT: s_or_b32 s27, s35, s27 -; VI-NEXT: s_or_b32 s26, s34, s26 -; VI-NEXT: s_or_b32 s25, s31, s25 -; VI-NEXT: s_or_b32 s24, s30, s24 -; VI-NEXT: s_or_b32 s23, vcc_hi, s23 -; VI-NEXT: s_or_b32 s22, vcc_lo, s22 -; VI-NEXT: s_or_b32 s21, s91, s21 -; VI-NEXT: s_or_b32 s20, s90, s20 -; VI-NEXT: s_or_b32 s19, s89, s19 -; VI-NEXT: s_or_b32 s18, s88, s18 -; VI-NEXT: s_or_b32 s17, s79, s17 -; VI-NEXT: s_or_b32 s16, s78, s16 -; VI-NEXT: s_or_b32 s8, s77, s8 -; VI-NEXT: s_or_b32 s9, s76, s9 -; VI-NEXT: s_or_b32 s10, s75, s10 -; VI-NEXT: s_or_b32 s11, s74, s11 -; VI-NEXT: s_or_b32 s12, s73, s12 -; VI-NEXT: s_or_b32 s13, s72, s13 -; VI-NEXT: s_or_b32 s14, s63, s14 -; VI-NEXT: s_or_b32 s15, s62, s15 -; VI-NEXT: s_or_b32 s40, s61, s40 -; VI-NEXT: s_or_b32 s41, s60, s41 -; VI-NEXT: s_or_b32 s42, s59, s42 -; VI-NEXT: s_or_b32 s43, s58, s43 -; VI-NEXT: s_or_b32 s44, s57, s44 -; VI-NEXT: s_or_b32 s45, s56, s45 -; VI-NEXT: s_or_b32 s46, s47, s46 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 -; VI-NEXT: s_add_i32 s40, s40, 0x30000 -; VI-NEXT: s_add_i32 s41, s41, 0x30000 -; VI-NEXT: s_add_i32 s42, s42, 0x30000 -; VI-NEXT: s_add_i32 s43, s43, 0x30000 -; VI-NEXT: s_add_i32 s44, s44, 0x30000 -; VI-NEXT: s_add_i32 s45, s45, 0x30000 -; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s5, s46, 3 ; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB71_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -114265,36 +113947,24 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s7 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s47 -; VI-NEXT: v_mov_b32_e32 v17, s46 -; VI-NEXT: v_mov_b32_e32 v18, s45 -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: v_mov_b32_e32 v20, s43 -; VI-NEXT: v_mov_b32_e32 v21, s42 -; VI-NEXT: v_mov_b32_e32 v22, s41 -; VI-NEXT: v_mov_b32_e32 v23, s40 -; VI-NEXT: v_mov_b32_e32 v24, s15 -; VI-NEXT: v_mov_b32_e32 v25, s14 -; VI-NEXT: v_mov_b32_e32 v26, s13 -; VI-NEXT: v_mov_b32_e32 v27, s12 -; VI-NEXT: v_mov_b32_e32 v28, s11 -; VI-NEXT: v_mov_b32_e32 v29, s10 -; VI-NEXT: v_mov_b32_e32 v30, s9 -; VI-NEXT: v_mov_b32_e32 v31, s8 -; VI-NEXT: v_readlane_b32 s39, v32, 7 -; VI-NEXT: v_readlane_b32 s38, v32, 6 -; VI-NEXT: v_readlane_b32 s37, v32, 5 -; VI-NEXT: v_readlane_b32 s36, v32, 4 -; VI-NEXT: v_readlane_b32 s35, v32, 3 -; VI-NEXT: v_readlane_b32 s34, v32, 2 -; VI-NEXT: v_readlane_b32 s31, v32, 1 -; VI-NEXT: v_readlane_b32 s30, v32, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: ; VI-NEXT: s_branch .LBB71_2 @@ -132135,255 +131805,277 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB75_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -132392,223 +132084,208 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v52, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v40, v21 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v34, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v58, v2 -; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -132639,535 +132316,441 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v36 -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: v_mov_b32_e32 v49, v38 -; VI-NEXT: v_mov_b32_e32 v44, v24 -; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 ; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: v_mov_b32_e32 v43, v59 -; VI-NEXT: v_mov_b32_e32 v52, v48 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB75_3: ; %Flow -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB75_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 -; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_or_b32_e32 v30, v30, v31 -; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v32 -; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 -; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_e32 v26, v26, v33 -; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v28, v28, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v29, v29, v33 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB75_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -150532,201 +150115,190 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v32, s30, 0 -; VI-NEXT: v_writelane_b32 v32, s31, 1 -; VI-NEXT: v_writelane_b32 v32, s34, 2 -; VI-NEXT: v_writelane_b32 v32, s35, 3 -; VI-NEXT: v_writelane_b32 v32, s36, 4 -; VI-NEXT: v_writelane_b32 v32, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v32, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s47, v2 -; VI-NEXT: v_readfirstlane_b32 s46, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s44, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s42, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s40, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s14, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s12, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s10, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s8, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s6, v1 -; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: v_readfirstlane_b32 s47, v1 ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_add_i32 s5, s47, 3 -; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 -; VI-NEXT: s_add_i32 s46, s46, 3 -; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 -; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 -; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s12, s12, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s14, s14, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s40, s40, 0xffff -; VI-NEXT: s_and_b32 s41, s41, 0xffff -; VI-NEXT: s_and_b32 s42, s42, 0xffff -; VI-NEXT: s_and_b32 s43, s43, 0xffff -; VI-NEXT: s_and_b32 s44, s44, 0xffff -; VI-NEXT: s_and_b32 s45, s45, 0xffff -; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s39, s6 -; VI-NEXT: s_or_b32 s7, s38, s7 -; VI-NEXT: s_or_b32 s29, s37, s29 -; VI-NEXT: s_or_b32 s28, s36, s28 -; VI-NEXT: s_or_b32 s27, s35, s27 -; VI-NEXT: s_or_b32 s26, s34, s26 -; VI-NEXT: s_or_b32 s25, s31, s25 -; VI-NEXT: s_or_b32 s24, s30, s24 -; VI-NEXT: s_or_b32 s23, vcc_hi, s23 -; VI-NEXT: s_or_b32 s22, vcc_lo, s22 -; VI-NEXT: s_or_b32 s21, s91, s21 -; VI-NEXT: s_or_b32 s20, s90, s20 -; VI-NEXT: s_or_b32 s19, s89, s19 -; VI-NEXT: s_or_b32 s18, s88, s18 -; VI-NEXT: s_or_b32 s17, s79, s17 -; VI-NEXT: s_or_b32 s16, s78, s16 -; VI-NEXT: s_or_b32 s8, s77, s8 -; VI-NEXT: s_or_b32 s9, s76, s9 -; VI-NEXT: s_or_b32 s10, s75, s10 -; VI-NEXT: s_or_b32 s11, s74, s11 -; VI-NEXT: s_or_b32 s12, s73, s12 -; VI-NEXT: s_or_b32 s13, s72, s13 -; VI-NEXT: s_or_b32 s14, s63, s14 -; VI-NEXT: s_or_b32 s15, s62, s15 -; VI-NEXT: s_or_b32 s40, s61, s40 -; VI-NEXT: s_or_b32 s41, s60, s41 -; VI-NEXT: s_or_b32 s42, s59, s42 -; VI-NEXT: s_or_b32 s43, s58, s43 -; VI-NEXT: s_or_b32 s44, s57, s44 -; VI-NEXT: s_or_b32 s45, s56, s45 -; VI-NEXT: s_or_b32 s46, s47, s46 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 -; VI-NEXT: s_add_i32 s40, s40, 0x30000 -; VI-NEXT: s_add_i32 s41, s41, 0x30000 -; VI-NEXT: s_add_i32 s42, s42, 0x30000 -; VI-NEXT: s_add_i32 s43, s43, 0x30000 -; VI-NEXT: s_add_i32 s44, s44, 0x30000 -; VI-NEXT: s_add_i32 s45, s45, 0x30000 -; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s5, s46, 3 ; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB87_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -150742,36 +150314,24 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s7 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s47 -; VI-NEXT: v_mov_b32_e32 v17, s46 -; VI-NEXT: v_mov_b32_e32 v18, s45 -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: v_mov_b32_e32 v20, s43 -; VI-NEXT: v_mov_b32_e32 v21, s42 -; VI-NEXT: v_mov_b32_e32 v22, s41 -; VI-NEXT: v_mov_b32_e32 v23, s40 -; VI-NEXT: v_mov_b32_e32 v24, s15 -; VI-NEXT: v_mov_b32_e32 v25, s14 -; VI-NEXT: v_mov_b32_e32 v26, s13 -; VI-NEXT: v_mov_b32_e32 v27, s12 -; VI-NEXT: v_mov_b32_e32 v28, s11 -; VI-NEXT: v_mov_b32_e32 v29, s10 -; VI-NEXT: v_mov_b32_e32 v30, s9 -; VI-NEXT: v_mov_b32_e32 v31, s8 -; VI-NEXT: v_readlane_b32 s39, v32, 7 -; VI-NEXT: v_readlane_b32 s38, v32, 6 -; VI-NEXT: v_readlane_b32 s37, v32, 5 -; VI-NEXT: v_readlane_b32 s36, v32, 4 -; VI-NEXT: v_readlane_b32 s35, v32, 3 -; VI-NEXT: v_readlane_b32 s34, v32, 2 -; VI-NEXT: v_readlane_b32 s31, v32, 1 -; VI-NEXT: v_readlane_b32 s30, v32, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: ; VI-NEXT: s_branch .LBB87_2 @@ -159217,273 +158777,275 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v46, v16 -; VI-NEXT: v_mov_b32_e32 v60, v5 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; VI-NEXT: v_mov_b32_e32 v62, v21 -; VI-NEXT: v_mov_b32_e32 v47, v17 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v18 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:88 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v19 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:204 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v22 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:276 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v2 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:256 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:220 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:268 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:280 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:312 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:308 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:232 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:200 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB89_2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB89_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v60 -; VI-NEXT: v_mov_b32_e32 v28, v26 -; VI-NEXT: v_mov_b32_e32 v26, v23 -; VI-NEXT: v_mov_b32_e32 v23, v5 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v22 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v59, v10 -; VI-NEXT: v_mov_b32_e32 v58, v43 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v27, v14 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -159493,240 +159055,225 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v2, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_mov_b32_e32 v34, v40 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v57 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v60 -; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v38 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v56, v45 -; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v63, v0 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v57, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v53 -; VI-NEXT: v_mov_b32_e32 v55, v63 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v44 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v46 -; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v41, v52 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v63, v54 -; VI-NEXT: v_mov_b32_e32 v54, v49 -; VI-NEXT: v_mov_b32_e32 v49, v53 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -159754,550 +159301,417 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: s_branch .LBB89_3 -; VI-NEXT: .LBB89_2: -; VI-NEXT: v_mov_b32_e32 v34, v40 -; VI-NEXT: v_mov_b32_e32 v57, v61 -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_mov_b32_e32 v56, v45 -; VI-NEXT: v_mov_b32_e32 v51, v42 -; VI-NEXT: v_mov_b32_e32 v39, v41 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: v_mov_b32_e32 v36, v49 -; VI-NEXT: v_mov_b32_e32 v35, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v53 -; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: v_mov_b32_e32 v52, v38 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: .LBB89_3: ; %Flow -; VI-NEXT: v_mov_b32_e32 v38, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: s_cbranch_vccnz .LBB89_5 -; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v47 -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v53, v34 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_and_b32 s10, s24, 0xff -; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s10 -; VI-NEXT: s_and_b32 s12, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s11, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s12 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v39 -; VI-NEXT: s_and_b32 s10, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v39, 0xff, v39 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s10, 16 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v31, 24, v63 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 +; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 +; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v21, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x300, v21 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v23 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 -; VI-NEXT: v_lshlrev_b32_e32 v33, 24, v59 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v43, vcc, 3, v50 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v24 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v26 -; VI-NEXT: v_and_b32_e32 v26, 0xff, v43 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_or_b32_e32 v26, v33, v26 -; VI-NEXT: v_or_b32_sdwa v26, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_add_u32_e32 v44, vcc, 3, v51 -; VI-NEXT: v_and_b32_e32 v27, 0xff, v44 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v7 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v58 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v6 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x300, v5 -; VI-NEXT: v_lshlrev_b32_e32 v32, 24, v49 -; VI-NEXT: v_or_b32_e32 v27, v32, v27 -; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 -; VI-NEXT: v_or_b32_e32 v4, v4, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v29, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_e32 v3, s4, v4 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v5, v29, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v6, v29, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v8, v29, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v9, v29, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v10, v29, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v35 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v52 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v35, 24, v56 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v37 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v41 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v14, v29, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v2 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v62 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v55 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v36 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v29, v29, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v53 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v19, v29, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v57 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v20, v29, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v38 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v21, v29, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v22 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v55, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v54 -; VI-NEXT: v_lshlrev_b32_e32 v54, 24, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v55 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v40, 24, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v41, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; VI-NEXT: v_and_b32_e32 v41, 0xff, v41 -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; VI-NEXT: v_or_b32_e32 v55, v40, v55 -; VI-NEXT: v_or_b32_sdwa v23, v55, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v42, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v42 -; VI-NEXT: v_or_b32_e32 v54, v54, v55 -; VI-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v53, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_or_b32_e32 v35, v35, v53 -; VI-NEXT: v_or_b32_sdwa v25, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v52, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; VI-NEXT: v_or_b32_e32 v28, v28, v32 -; VI-NEXT: v_or_b32_sdwa v28, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v51, 0xff, v51 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 -; VI-NEXT: v_or_b32_e32 v29, v29, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 -; VI-NEXT: v_or_b32_e32 v30, v30, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_e32 v31, v31, v32 -; VI-NEXT: v_or_b32_sdwa v30, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 +; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: s_and_b32 s4, s26, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB89_5: ; %end +; VI-NEXT: .LBB89_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -160316,6 +159730,39 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v63, v3 +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB89_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX9: ; %bb.0: @@ -187376,273 +186823,275 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v46, v16 -; VI-NEXT: v_mov_b32_e32 v60, v5 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; VI-NEXT: v_mov_b32_e32 v62, v21 -; VI-NEXT: v_mov_b32_e32 v47, v17 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v18 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:88 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v19 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:204 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v22 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:276 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v2 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:256 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:220 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:268 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:280 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:312 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:308 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:232 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:200 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB93_2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB93_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v60 -; VI-NEXT: v_mov_b32_e32 v28, v26 -; VI-NEXT: v_mov_b32_e32 v26, v23 -; VI-NEXT: v_mov_b32_e32 v23, v5 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v22 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v59, v10 -; VI-NEXT: v_mov_b32_e32 v58, v43 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v27, v14 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -187652,240 +187101,225 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v2, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_mov_b32_e32 v34, v40 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v57 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v60 -; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v38 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v56, v45 -; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v63, v0 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v57, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v53 -; VI-NEXT: v_mov_b32_e32 v55, v63 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v44 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v46 -; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v41, v52 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v63, v54 -; VI-NEXT: v_mov_b32_e32 v54, v49 -; VI-NEXT: v_mov_b32_e32 v49, v53 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -187913,550 +187347,417 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: s_branch .LBB93_3 -; VI-NEXT: .LBB93_2: -; VI-NEXT: v_mov_b32_e32 v34, v40 -; VI-NEXT: v_mov_b32_e32 v57, v61 -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_mov_b32_e32 v56, v45 -; VI-NEXT: v_mov_b32_e32 v51, v42 -; VI-NEXT: v_mov_b32_e32 v39, v41 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: v_mov_b32_e32 v36, v49 -; VI-NEXT: v_mov_b32_e32 v35, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v53 -; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: v_mov_b32_e32 v52, v38 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: .LBB93_3: ; %Flow -; VI-NEXT: v_mov_b32_e32 v38, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: s_cbranch_vccnz .LBB93_5 -; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v47 -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v53, v34 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_and_b32 s10, s24, 0xff -; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s10 -; VI-NEXT: s_and_b32 s12, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s11, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s12 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v39 -; VI-NEXT: s_and_b32 s10, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v39, 0xff, v39 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s10, 16 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v31, 24, v63 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 +; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 +; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v21, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x300, v21 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v23 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 -; VI-NEXT: v_lshlrev_b32_e32 v33, 24, v59 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v43, vcc, 3, v50 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v24 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v26 -; VI-NEXT: v_and_b32_e32 v26, 0xff, v43 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_or_b32_e32 v26, v33, v26 -; VI-NEXT: v_or_b32_sdwa v26, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_add_u32_e32 v44, vcc, 3, v51 -; VI-NEXT: v_and_b32_e32 v27, 0xff, v44 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v7 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v58 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v6 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x300, v5 -; VI-NEXT: v_lshlrev_b32_e32 v32, 24, v49 -; VI-NEXT: v_or_b32_e32 v27, v32, v27 -; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 -; VI-NEXT: v_or_b32_e32 v4, v4, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v29, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_e32 v3, s4, v4 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v5, v29, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v6, v29, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v8, v29, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v9, v29, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v10, v29, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v35 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v52 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v35, 24, v56 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v37 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v41 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v14, v29, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v2 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v62 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v55 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v36 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v29, v29, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v53 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v19, v29, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v57 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v20, v29, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v38 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v21, v29, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v22 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v55, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v54 -; VI-NEXT: v_lshlrev_b32_e32 v54, 24, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v55 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v40, 24, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v41, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; VI-NEXT: v_and_b32_e32 v41, 0xff, v41 -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; VI-NEXT: v_or_b32_e32 v55, v40, v55 -; VI-NEXT: v_or_b32_sdwa v23, v55, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v42, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v42 -; VI-NEXT: v_or_b32_e32 v54, v54, v55 -; VI-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v53, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_or_b32_e32 v35, v35, v53 -; VI-NEXT: v_or_b32_sdwa v25, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v52, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; VI-NEXT: v_or_b32_e32 v28, v28, v32 -; VI-NEXT: v_or_b32_sdwa v28, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v51, 0xff, v51 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 -; VI-NEXT: v_or_b32_e32 v29, v29, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 -; VI-NEXT: v_or_b32_e32 v30, v30, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_e32 v31, v31, v32 -; VI-NEXT: v_or_b32_sdwa v30, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 +; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: s_and_b32 s4, s26, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB93_5: ; %end +; VI-NEXT: .LBB93_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -188475,6 +187776,39 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v63, v3 +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB93_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX9: ; %bb.0: @@ -211143,273 +210477,275 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v46, v16 -; VI-NEXT: v_mov_b32_e32 v60, v5 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; VI-NEXT: v_mov_b32_e32 v62, v21 -; VI-NEXT: v_mov_b32_e32 v47, v17 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v18 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:88 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v19 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:204 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v22 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:276 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v2 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:256 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:220 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:268 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:280 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:312 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:308 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:232 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:200 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB97_2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB97_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v60 -; VI-NEXT: v_mov_b32_e32 v28, v26 -; VI-NEXT: v_mov_b32_e32 v26, v23 -; VI-NEXT: v_mov_b32_e32 v23, v5 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v22 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v59, v10 -; VI-NEXT: v_mov_b32_e32 v58, v43 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v27, v14 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -211419,240 +210755,225 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v2, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_mov_b32_e32 v34, v40 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v57 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v60 -; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v38 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v56, v45 -; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v63, v0 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v57, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v53 -; VI-NEXT: v_mov_b32_e32 v55, v63 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v44 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v46 -; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v41, v52 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v63, v54 -; VI-NEXT: v_mov_b32_e32 v54, v49 -; VI-NEXT: v_mov_b32_e32 v49, v53 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -211680,550 +211001,417 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: s_branch .LBB97_3 -; VI-NEXT: .LBB97_2: -; VI-NEXT: v_mov_b32_e32 v34, v40 -; VI-NEXT: v_mov_b32_e32 v57, v61 -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_mov_b32_e32 v56, v45 -; VI-NEXT: v_mov_b32_e32 v51, v42 -; VI-NEXT: v_mov_b32_e32 v39, v41 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: v_mov_b32_e32 v36, v49 -; VI-NEXT: v_mov_b32_e32 v35, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v53 -; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: v_mov_b32_e32 v52, v38 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: .LBB97_3: ; %Flow -; VI-NEXT: v_mov_b32_e32 v38, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: s_cbranch_vccnz .LBB97_5 -; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v47 -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v53, v34 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_and_b32 s10, s24, 0xff -; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s10 -; VI-NEXT: s_and_b32 s12, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s11, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s12 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v39 -; VI-NEXT: s_and_b32 s10, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v39, 0xff, v39 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s10, 16 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v31, 24, v63 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 +; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 +; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v21, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x300, v21 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v23 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 -; VI-NEXT: v_lshlrev_b32_e32 v33, 24, v59 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v43, vcc, 3, v50 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v24 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v26 -; VI-NEXT: v_and_b32_e32 v26, 0xff, v43 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_or_b32_e32 v26, v33, v26 -; VI-NEXT: v_or_b32_sdwa v26, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_add_u32_e32 v44, vcc, 3, v51 -; VI-NEXT: v_and_b32_e32 v27, 0xff, v44 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v7 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v58 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v6 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x300, v5 -; VI-NEXT: v_lshlrev_b32_e32 v32, 24, v49 -; VI-NEXT: v_or_b32_e32 v27, v32, v27 -; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 -; VI-NEXT: v_or_b32_e32 v4, v4, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v29, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_e32 v3, s4, v4 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v5, v29, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v6, v29, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v8, v29, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v9, v29, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v10, v29, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v35 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v52 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v35, 24, v56 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v37 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v41 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v14, v29, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v2 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v62 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v55 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v36 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v29, v29, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v53 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v19, v29, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v57 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_or_b32_sdwa v20, v29, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v38 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v21, v29, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v22 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_e32 v29, v29, v30 -; VI-NEXT: v_or_b32_sdwa v55, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v54 -; VI-NEXT: v_lshlrev_b32_e32 v54, 24, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v55 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v40, 24, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v41, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; VI-NEXT: v_and_b32_e32 v41, 0xff, v41 -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; VI-NEXT: v_or_b32_e32 v55, v40, v55 -; VI-NEXT: v_or_b32_sdwa v23, v55, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v42, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v42 -; VI-NEXT: v_or_b32_e32 v54, v54, v55 -; VI-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v53, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_or_b32_e32 v35, v35, v53 -; VI-NEXT: v_or_b32_sdwa v25, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v52, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; VI-NEXT: v_or_b32_e32 v28, v28, v32 -; VI-NEXT: v_or_b32_sdwa v28, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v34 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: v_and_b32_e32 v51, 0xff, v51 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 -; VI-NEXT: v_or_b32_e32 v29, v29, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 -; VI-NEXT: v_or_b32_e32 v30, v30, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_e32 v31, v31, v32 -; VI-NEXT: v_or_b32_sdwa v30, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 +; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: s_and_b32 s4, s26, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB97_5: ; %end +; VI-NEXT: .LBB97_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -212242,6 +211430,39 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v63, v3 +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB97_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX9: ; %bb.0: @@ -222133,269 +221354,269 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_lshr_b64 s[78:79], s[42:43], 24 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true -; VI-NEXT: s_and_b32 s47, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s46, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_or_b32 s43, s46, s43 +; VI-NEXT: s_and_b32 s46, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_or_b32 s42, s46, s42 +; VI-NEXT: s_and_b32 s46, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_or_b32 s41, s46, s41 +; VI-NEXT: s_and_b32 s46, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_or_b32 s40, s46, s40 +; VI-NEXT: s_and_b32 s46, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_or_b32 s15, s46, s15 +; VI-NEXT: s_and_b32 s46, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_or_b32 s14, s46, s14 +; VI-NEXT: s_and_b32 s46, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_or_b32 s13, s46, s13 +; VI-NEXT: s_and_b32 s46, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_or_b32 s12, s46, s12 +; VI-NEXT: s_and_b32 s46, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_or_b32 s11, s46, s11 +; VI-NEXT: s_and_b32 s46, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s10, s46, s10 +; VI-NEXT: s_and_b32 s46, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s9, s46, s9 +; VI-NEXT: s_and_b32 s46, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s8, s46, s8 +; VI-NEXT: s_and_b32 s46, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s7, s46, s7 +; VI-NEXT: s_and_b32 s46, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s6, s46, s6 +; VI-NEXT: s_and_b32 s46, s5, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s5, s46, s5 +; VI-NEXT: s_and_b32 s46, s4, 0xffff0000 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s46, s4 +; VI-NEXT: s_and_b32 s46, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_or_b32 s17, s46, s17 +; VI-NEXT: s_and_b32 s46, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s16, s46, s16 +; VI-NEXT: s_and_b32 s46, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_or_b32 s19, s46, s19 +; VI-NEXT: s_and_b32 s46, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_or_b32 s18, s46, s18 +; VI-NEXT: s_and_b32 s46, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_or_b32 s21, s46, s21 +; VI-NEXT: s_and_b32 s46, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_or_b32 s20, s46, s20 +; VI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_or_b32 s23, s46, s23 +; VI-NEXT: s_and_b32 s46, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_or_b32 s22, s46, s22 +; VI-NEXT: s_and_b32 s46, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_or_b32 s25, s46, s25 +; VI-NEXT: s_and_b32 s46, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_or_b32 s24, s46, s24 +; VI-NEXT: s_and_b32 s46, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_or_b32 s27, s46, s27 +; VI-NEXT: s_and_b32 s46, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_or_b32 s26, s46, s26 +; VI-NEXT: s_and_b32 s46, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_or_b32 s29, s46, s29 +; VI-NEXT: s_and_b32 s46, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_or_b32 s28, s46, s28 +; VI-NEXT: s_and_b32 s46, s45, 0xffff0000 ; VI-NEXT: s_add_i32 s45, s45, 3 ; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_or_b32 s45, s46, s45 ; VI-NEXT: s_and_b32 s46, s44, 0xffff0000 ; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_or_b32 s45, s47, s45 ; VI-NEXT: s_add_i32 s45, s45, 0x30000 ; VI-NEXT: s_and_b32 s44, s44, 0xffff ; VI-NEXT: s_or_b32 s44, s46, s44 ; VI-NEXT: s_lshr_b32 s46, s45, 24 -; VI-NEXT: s_and_b32 s57, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_writelane_b32 v21, s46, 0 ; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: s_and_b32 s29, s29, 0xffff ; VI-NEXT: s_add_i32 s44, s44, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 1 ; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: s_or_b32 s29, s57, s29 ; VI-NEXT: v_writelane_b32 v21, s46, 2 ; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: s_and_b32 s56, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_add_i32 s29, s29, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 3 ; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: s_and_b32 s28, s28, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 4 ; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: s_and_b32 s59, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_or_b32 s28, s56, s28 ; VI-NEXT: v_writelane_b32 v21, s46, 5 ; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_and_b32 s27, s27, 0xffff ; VI-NEXT: s_add_i32 s28, s28, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 6 ; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: s_or_b32 s27, s59, s27 ; VI-NEXT: v_writelane_b32 v21, s46, 7 ; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: s_and_b32 s58, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_add_i32 s27, s27, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 8 ; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 9 ; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: s_and_b32 s61, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_or_b32 s26, s58, s26 ; VI-NEXT: v_writelane_b32 v21, s46, 10 ; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: s_and_b32 s25, s25, 0xffff ; VI-NEXT: s_add_i32 s26, s26, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 11 ; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: s_or_b32 s25, s61, s25 ; VI-NEXT: v_writelane_b32 v21, s46, 12 ; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: s_and_b32 s60, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_add_i32 s25, s25, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 13 ; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: s_and_b32 s24, s24, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 14 ; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: s_and_b32 s63, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_or_b32 s24, s60, s24 ; VI-NEXT: v_writelane_b32 v21, s46, 15 ; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_and_b32 s23, s23, 0xffff ; VI-NEXT: s_add_i32 s24, s24, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 16 ; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: s_or_b32 s23, s63, s23 ; VI-NEXT: v_writelane_b32 v21, s46, 17 ; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: s_and_b32 s62, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_add_i32 s23, s23, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 18 ; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: s_and_b32 s22, s22, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 19 ; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: s_and_b32 s73, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_or_b32 s22, s62, s22 ; VI-NEXT: v_writelane_b32 v21, s46, 20 ; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: s_and_b32 s79, s5, 0xffff0000 -; VI-NEXT: s_add_i32 s5, s5, 3 -; VI-NEXT: s_and_b32 s21, s21, 0xffff ; VI-NEXT: s_add_i32 s22, s22, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 21 ; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s21, s73, s21 ; VI-NEXT: v_writelane_b32 v21, s46, 22 ; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: s_or_b32 s5, s79, s5 ; VI-NEXT: s_add_i32 s21, s21, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 23 ; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: s_and_b32 s78, s4, 0xffff0000 -; VI-NEXT: s_add_i32 s4, s4, 3 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 24 ; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 25 ; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: s_and_b32 s89, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_or_b32 s4, s78, s4 ; VI-NEXT: v_writelane_b32 v21, s46, 26 ; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 27 ; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: s_or_b32 s7, s89, s7 ; VI-NEXT: v_writelane_b32 v21, s46, 28 ; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: s_and_b32 s88, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 29 ; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 30 ; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: s_and_b32 s91, s9, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_or_b32 s6, s88, s6 ; VI-NEXT: v_writelane_b32 v21, s46, 31 ; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 32 ; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: s_or_b32 s9, s91, s9 ; VI-NEXT: v_writelane_b32 v21, s46, 33 ; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: s_and_b32 s90, s8, 0xffff0000 -; VI-NEXT: s_add_i32 s8, s8, 3 ; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 34 ; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 35 ; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: s_and_b32 vcc_hi, s11, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_or_b32 s8, s90, s8 ; VI-NEXT: v_writelane_b32 v21, s46, 36 ; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: s_and_b32 s11, s11, 0xffff ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 37 ; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: s_or_b32 s11, vcc_hi, s11 ; VI-NEXT: v_writelane_b32 v21, s46, 38 ; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: s_and_b32 vcc_lo, s10, 0xffff0000 -; VI-NEXT: s_add_i32 s10, s10, 3 ; VI-NEXT: s_add_i32 s11, s11, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 39 ; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 40 ; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: s_and_b32 s31, s13, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_or_b32 s10, vcc_lo, s10 ; VI-NEXT: v_writelane_b32 v21, s46, 41 ; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: s_and_b32 s13, s13, 0xffff ; VI-NEXT: s_add_i32 s10, s10, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 42 ; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: s_or_b32 s13, s31, s13 ; VI-NEXT: v_writelane_b32 v21, s46, 43 ; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: s_and_b32 s30, s12, 0xffff0000 -; VI-NEXT: s_add_i32 s12, s12, 3 ; VI-NEXT: s_add_i32 s13, s13, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 44 ; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 45 ; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: s_and_b32 s35, s15, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_or_b32 s12, s30, s12 ; VI-NEXT: v_writelane_b32 v21, s46, 46 ; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: s_and_b32 s15, s15, 0xffff ; VI-NEXT: s_add_i32 s12, s12, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 47 ; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: s_or_b32 s15, s35, s15 ; VI-NEXT: v_writelane_b32 v21, s46, 48 ; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: s_and_b32 s34, s14, 0xffff0000 -; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 49 ; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 50 ; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: s_and_b32 s37, s41, 0xffff0000 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_or_b32 s14, s34, s14 ; VI-NEXT: v_writelane_b32 v21, s46, 51 ; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: s_and_b32 s41, s41, 0xffff ; VI-NEXT: s_add_i32 s14, s14, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 52 ; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: s_and_b32 s72, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s74, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s75, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s76, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s77, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s36, s40, 0xffff0000 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_and_b32 s38, s42, 0xffff0000 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s39, s43, 0xffff0000 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_or_b32 s41, s37, s41 ; VI-NEXT: v_writelane_b32 v21, s46, 53 ; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: s_and_b32 s43, s43, 0xffff -; VI-NEXT: s_and_b32 s42, s42, 0xffff ; VI-NEXT: s_add_i32 s41, s41, 0x30000 -; VI-NEXT: s_and_b32 s40, s40, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff ; VI-NEXT: v_writelane_b32 v21, s46, 54 ; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: s_or_b32 s43, s39, s43 -; VI-NEXT: s_or_b32 s42, s38, s42 -; VI-NEXT: s_or_b32 s40, s36, s40 -; VI-NEXT: s_or_b32 s17, s77, s17 -; VI-NEXT: s_or_b32 s16, s76, s16 -; VI-NEXT: s_or_b32 s19, s75, s19 -; VI-NEXT: s_or_b32 s18, s74, s18 -; VI-NEXT: s_or_b32 s20, s72, s20 ; VI-NEXT: v_writelane_b32 v21, s46, 55 ; VI-NEXT: s_lshr_b32 s46, s41, 24 ; VI-NEXT: s_add_i32 s43, s43, 0x30000 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 4cf1a71470c53..18fdc267851f6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1516,25 +1516,25 @@ define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 in ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -4573,9 +4573,9 @@ define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 in ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -4608,74 +4608,74 @@ define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB27_2 ; ; GFX9-LABEL: bitcast_v16i8_to_v4i32_scalar: @@ -6051,25 +6051,25 @@ define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB39_3 ; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB39_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -9134,9 +9134,9 @@ define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -9169,74 +9169,74 @@ define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB51_2 ; ; GFX9-LABEL: bitcast_v16i8_to_v4f32_scalar: @@ -10243,25 +10243,25 @@ define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 in ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_3 ; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB59_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -13302,9 +13302,9 @@ define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 in ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -13337,74 +13337,74 @@ define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB71_2 ; ; GFX9-LABEL: bitcast_v16i8_to_v2i64_scalar: @@ -14049,25 +14049,25 @@ define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB75_3 ; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB75_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -17085,9 +17085,9 @@ define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -17120,74 +17120,74 @@ define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB87_2 ; ; GFX9-LABEL: bitcast_v16i8_to_v2f64_scalar: @@ -19592,25 +19592,25 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in ; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s7, s19, 3 -; VI-NEXT: s_add_i32 s9, s16, 3 -; VI-NEXT: s_add_i32 s11, s17, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s10, 0x30000 -; VI-NEXT: s_add_i32 s16, s8, 0x30000 -; VI-NEXT: s_add_i32 s19, s6, 0x30000 ; VI-NEXT: s_add_i32 s18, s4, 0x30000 ; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 @@ -20618,9 +20618,9 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -20653,74 +20653,74 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB99_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB99_2 ; ; GFX9-LABEL: bitcast_v16i8_to_v8i16_scalar: @@ -23589,9 +23589,9 @@ define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 i ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -23624,74 +23624,74 @@ define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 i ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB107_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB107_2 ; ; GFX9-LABEL: bitcast_v16i8_to_v8f16_scalar: @@ -25982,9 +25982,9 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -26017,74 +26017,74 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB111_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB111_2 ; ; GFX9-LABEL: bitcast_v16i8_to_v8bf16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index 6e2ae809d5030..c87d52c1e6907 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -865,30 +865,30 @@ define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -2084,30 +2084,30 @@ define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index c366836520a82..c3ace0ac5af71 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1739,35 +1739,35 @@ define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -3891,35 +3891,35 @@ define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -5628,35 +5628,35 @@ define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -6959,35 +6959,35 @@ define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index 48070b75804f5..c830d6b344b6f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -1003,40 +1003,40 @@ define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -2470,40 +2470,40 @@ define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index e46df60a93343..4a52cb9f6459a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -1972,45 +1972,45 @@ define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -7081,39 +7081,37 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v23, v5 -; VI-NEXT: v_mov_b32_e32 v24, v4 -; VI-NEXT: v_mov_b32_e32 v21, v2 -; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 ; VI-NEXT: v_mov_b32_e32 v19, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -7149,100 +7147,85 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v8 -; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 -; VI-NEXT: v_or_b32_e32 v4, v4, v12 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: @@ -9329,45 +9312,45 @@ define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB39_3 ; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB39_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -14478,39 +14461,37 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v23, v5 -; VI-NEXT: v_mov_b32_e32 v24, v4 -; VI-NEXT: v_mov_b32_e32 v21, v2 -; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 ; VI-NEXT: v_mov_b32_e32 v19, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -14546,100 +14527,85 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v8 -; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 -; VI-NEXT: v_or_b32_e32 v4, v4, v12 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -16263,45 +16229,45 @@ define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_3 ; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB59_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -21382,39 +21348,37 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v23, v5 -; VI-NEXT: v_mov_b32_e32 v24, v4 -; VI-NEXT: v_mov_b32_e32 v21, v2 -; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 ; VI-NEXT: v_mov_b32_e32 v19, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -21450,100 +21414,85 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v8 -; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 -; VI-NEXT: v_or_b32_e32 v4, v4, v12 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB71_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: @@ -22715,45 +22664,45 @@ define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB75_3 ; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB75_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -27784,39 +27733,37 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v23, v5 -; VI-NEXT: v_mov_b32_e32 v24, v4 -; VI-NEXT: v_mov_b32_e32 v21, v2 -; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 ; VI-NEXT: v_mov_b32_e32 v19, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -27852,100 +27799,85 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v8 -; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 -; VI-NEXT: v_or_b32_e32 v4, v4, v12 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB87_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: @@ -32135,45 +32067,45 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s7, s23, 3 -; VI-NEXT: s_add_i32 s9, s20, 3 -; VI-NEXT: s_add_i32 s11, s21, 3 -; VI-NEXT: s_and_b32 s12, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s18, 3 -; VI-NEXT: s_and_b32 s14, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s19, 3 -; VI-NEXT: s_and_b32 s18, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s19, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s10, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s17, s19, s17 -; VI-NEXT: s_or_b32 s16, s18, s16 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s19, s14, 0x30000 -; VI-NEXT: s_add_i32 s18, s12, 0x30000 -; VI-NEXT: s_add_i32 s21, s10, 0x30000 -; VI-NEXT: s_add_i32 s20, s8, 0x30000 -; VI-NEXT: s_add_i32 s23, s6, 0x30000 ; VI-NEXT: s_add_i32 s22, s4, 0x30000 ; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 @@ -33871,40 +33803,38 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_mov_b32_e32 v23, v4 -; VI-NEXT: v_mov_b32_e32 v19, v2 -; VI-NEXT: v_mov_b32_e32 v21, v1 -; VI-NEXT: v_mov_b32_e32 v22, v0 +; VI-NEXT: v_mov_b32_e32 v21, v6 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v22, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 -; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v2, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -33939,101 +33869,86 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v22 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v23 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v8 +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_or_b32_e32 v3, v3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; VI-NEXT: v_or_b32_sdwa v5, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v21 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v11, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: v_or_b32_e32 v2, v2, v5 -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v22 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v19 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB99_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB99_4: @@ -39090,40 +39005,38 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_mov_b32_e32 v23, v4 -; VI-NEXT: v_mov_b32_e32 v19, v2 -; VI-NEXT: v_mov_b32_e32 v21, v1 -; VI-NEXT: v_mov_b32_e32 v22, v0 +; VI-NEXT: v_mov_b32_e32 v21, v6 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v22, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 -; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v2, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -39158,101 +39071,86 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v22 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v23 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v8 +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_or_b32_e32 v3, v3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; VI-NEXT: v_or_b32_sdwa v5, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v21 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v11, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: v_or_b32_e32 v2, v2, v5 -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v22 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v19 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB107_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB107_4: @@ -43457,40 +43355,38 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_mov_b32_e32 v23, v4 -; VI-NEXT: v_mov_b32_e32 v19, v2 -; VI-NEXT: v_mov_b32_e32 v21, v1 -; VI-NEXT: v_mov_b32_e32 v22, v0 +; VI-NEXT: v_mov_b32_e32 v21, v6 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v22, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 -; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v2, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -43525,101 +43421,86 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v22 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v23 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v8 +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_or_b32_e32 v3, v3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; VI-NEXT: v_or_b32_sdwa v5, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v21 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v11, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: v_or_b32_e32 v2, v2, v5 -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v22 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v19 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB111_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB111_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index eebfb11613d85..6cf53d187fcab 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -1141,50 +1141,50 @@ define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -2874,50 +2874,50 @@ define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 1c6a2b24b1242..35ab38c67b1ec 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -1209,55 +1209,55 @@ define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -6106,52 +6106,49 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v34, v9 -; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v31, v8 ; VI-NEXT: v_mov_b32_e32 v30, v6 -; VI-NEXT: v_mov_b32_e32 v31, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v29, v2 -; VI-NEXT: v_mov_b32_e32 v28, v1 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v28, v2 ; VI-NEXT: v_mov_b32_e32 v27, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; VI-NEXT: v_or_b32_sdwa v0, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 ; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -6187,120 +6184,99 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v6, v6, v12 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v27 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v29 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v7, v7, v12 -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v34 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_or_b32_sdwa v17, v36, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v21 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 -; VI-NEXT: v_or_b32_e32 v4, v4, v19 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_or_b32_e32 v5, v5, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -8690,55 +8666,55 @@ define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -13617,52 +13593,49 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v34, v9 -; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v31, v8 ; VI-NEXT: v_mov_b32_e32 v30, v6 -; VI-NEXT: v_mov_b32_e32 v31, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v29, v2 -; VI-NEXT: v_mov_b32_e32 v28, v1 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v28, v2 ; VI-NEXT: v_mov_b32_e32 v27, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB35_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; VI-NEXT: v_or_b32_sdwa v0, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 ; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -13698,120 +13671,99 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB35_3 ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v6, v6, v12 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v27 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v29 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v7, v7, v12 -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v34 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_or_b32_sdwa v17, v36, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v21 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 -; VI-NEXT: v_or_b32_e32 v4, v4, v19 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_or_b32_e32 v5, v5, v13 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB35_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB35_4: @@ -18154,55 +18106,55 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s7, s25, 3 -; VI-NEXT: s_and_b32 s8, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s22, 3 -; VI-NEXT: s_and_b32 s10, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s23, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s20, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s21, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s22, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s23, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s17, s23, s17 -; VI-NEXT: s_or_b32 s16, s22, s16 -; VI-NEXT: s_or_b32 s19, s21, s19 -; VI-NEXT: s_or_b32 s18, s20, s18 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s23, s10, 0x30000 -; VI-NEXT: s_add_i32 s22, s8, 0x30000 -; VI-NEXT: s_add_i32 s25, s6, 0x30000 ; VI-NEXT: s_add_i32 s24, s4, 0x30000 ; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 @@ -20703,25 +20655,28 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v28, v14 -; VI-NEXT: v_mov_b32_e32 v31, v13 -; VI-NEXT: v_mov_b32_e32 v36, v12 -; VI-NEXT: v_mov_b32_e32 v29, v10 -; VI-NEXT: v_mov_b32_e32 v33, v9 -; VI-NEXT: v_mov_b32_e32 v27, v8 -; VI-NEXT: v_mov_b32_e32 v38, v6 -; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: v_mov_b32_e32 v27, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v29, v8 +; VI-NEXT: v_mov_b32_e32 v33, v6 ; VI-NEXT: v_mov_b32_e32 v30, v4 -; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -20754,154 +20709,126 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; VI-NEXT: v_or_b32_sdwa v0, v38, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v28, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v6, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: v_or_b32_e32 v3, v3, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; VI-NEXT: v_or_b32_e32 v4, v4, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v17 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v28 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; VI-NEXT: v_or_b32_e32 v5, v5, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; VI-NEXT: v_or_b32_sdwa v5, v38, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v17, v49, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: v_or_b32_e32 v2, v2, v7 -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v22 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; VI-NEXT: v_or_b32_sdwa v20, v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v14, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -21901,55 +21828,55 @@ define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -22747,55 +22674,55 @@ define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -27278,25 +27205,28 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v28, v14 -; VI-NEXT: v_mov_b32_e32 v31, v13 -; VI-NEXT: v_mov_b32_e32 v36, v12 -; VI-NEXT: v_mov_b32_e32 v29, v10 -; VI-NEXT: v_mov_b32_e32 v33, v9 -; VI-NEXT: v_mov_b32_e32 v27, v8 -; VI-NEXT: v_mov_b32_e32 v38, v6 -; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: v_mov_b32_e32 v27, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v29, v8 +; VI-NEXT: v_mov_b32_e32 v33, v6 ; VI-NEXT: v_mov_b32_e32 v30, v4 -; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB63_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -27329,154 +27259,126 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; VI-NEXT: v_or_b32_sdwa v0, v38, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v28, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB63_3 ; VI-NEXT: .LBB63_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v30 -; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v6, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: v_or_b32_e32 v3, v3, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; VI-NEXT: v_or_b32_e32 v4, v4, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v17 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v28 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; VI-NEXT: v_or_b32_e32 v5, v5, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; VI-NEXT: v_or_b32_sdwa v5, v38, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v17, v49, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: v_or_b32_e32 v2, v2, v7 -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v22 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; VI-NEXT: v_or_b32_sdwa v20, v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v14, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB63_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_4: @@ -31726,25 +31628,28 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v36, v14 -; VI-NEXT: v_mov_b32_e32 v37, v13 -; VI-NEXT: v_mov_b32_e32 v38, v12 -; VI-NEXT: v_mov_b32_e32 v29, v10 -; VI-NEXT: v_mov_b32_e32 v28, v9 -; VI-NEXT: v_mov_b32_e32 v27, v8 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v31, v5 -; VI-NEXT: v_mov_b32_e32 v30, v4 -; VI-NEXT: v_mov_b32_e32 v35, v2 -; VI-NEXT: v_mov_b32_e32 v34, v1 -; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v27, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB73_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -31777,33 +31682,26 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v32, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -31811,120 +31709,99 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB73_3 ; VI-NEXT: .LBB73_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v33 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v36 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v35 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v37 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v28 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_or_b32_sdwa v18, v50, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_or_b32_sdwa v14, v48, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_or_b32_sdwa v12, v39, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v20 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 -; VI-NEXT: v_or_b32_e32 v4, v4, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v5, v5, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 -; VI-NEXT: v_or_b32_e32 v6, v6, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: v_or_b32_e32 v7, v7, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v7, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB73_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB73_4: @@ -36397,25 +36274,28 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v36, v14 -; VI-NEXT: v_mov_b32_e32 v37, v13 -; VI-NEXT: v_mov_b32_e32 v38, v12 -; VI-NEXT: v_mov_b32_e32 v29, v10 -; VI-NEXT: v_mov_b32_e32 v28, v9 -; VI-NEXT: v_mov_b32_e32 v27, v8 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v31, v5 -; VI-NEXT: v_mov_b32_e32 v30, v4 -; VI-NEXT: v_mov_b32_e32 v35, v2 -; VI-NEXT: v_mov_b32_e32 v34, v1 -; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v27, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB77_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -36448,33 +36328,26 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s7, s28, 0xff ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; VI-NEXT: v_or_b32_sdwa v0, v32, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -36482,120 +36355,99 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; VI-NEXT: s_cbranch_execnz .LBB77_3 ; VI-NEXT: .LBB77_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s19, 24 -; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s20, 0xff -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_and_b32 s10, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_and_b32 s8, s26, 0xff -; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v33 -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s28, 0xff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v36 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v35 -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v37 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v28 -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_or_b32_sdwa v18, v50, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_or_b32_sdwa v14, v48, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_or_b32_sdwa v12, v39, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v20 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 -; VI-NEXT: v_or_b32_e32 v4, v4, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v5, v5, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 -; VI-NEXT: v_or_b32_e32 v6, v6, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 -; VI-NEXT: v_or_b32_e32 v7, v7, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s9, s9, 0x3000000 -; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v7, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB77_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB77_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index da529d9dd3048..7b756bce857bc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -2463,17 +2463,17 @@ define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) { ; VI-NEXT: s_cbranch_execnz .LBB23_3 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_and_b32 s6, s16, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_add_i32 s6, s4, 0x3000000 ; VI-NEXT: .LBB23_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -4741,17 +4741,17 @@ define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_and_b32 s6, s16, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_add_i32 s6, s4, 0x3000000 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -6292,29 +6292,31 @@ define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inre ; VI-NEXT: s_cmp_lg_u32 s17, 0 ; VI-NEXT: s_cbranch_scc0 .LBB57_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s7, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 24 ; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_lshr_b32 s9, s16, 8 +; VI-NEXT: s_mov_b32 s7, s16 ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: s_add_i32 s6, s4, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshr_b32 s5, s16, 16 +; VI-NEXT: s_add_i32 s7, s16, 3 +; VI-NEXT: s_add_i32 s6, s5, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff ; VI-NEXT: s_lshl_b32 s5, s6, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_lshr_b32 s8, s4, 8 -; VI-NEXT: s_bfe_u32 s7, s6, 0x80008 +; VI-NEXT: s_lshr_b32 s9, s4, 8 +; VI-NEXT: s_bfe_u32 s8, s6, 0x80008 ; VI-NEXT: .LBB57_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v3, s8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB57_4: -; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: s_branch .LBB57_2 ; ; GFX9-LABEL: bitcast_v2i16_to_v4i8_scalar: @@ -6733,17 +6735,17 @@ define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB59_3 ; VI-NEXT: .LBB59_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_and_b32 s6, s16, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_add_i32 s6, s4, 0x3000000 ; VI-NEXT: .LBB59_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -8405,17 +8407,17 @@ define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inr ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_and_b32 s6, s16, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_add_i32 s6, s4, 0x3000000 ; VI-NEXT: .LBB71_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -9822,17 +9824,17 @@ define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB79_3 ; VI-NEXT: .LBB79_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_and_b32 s6, s16, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_add_i32 s6, s4, 0x3000000 ; VI-NEXT: .LBB79_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -10564,17 +10566,17 @@ define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB83_3 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_and_b32 s6, s16, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_add_i32 s6, s4, 0x3000000 ; VI-NEXT: .LBB83_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index 5842662481e5d..6fc9a35cd9ee6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -1280,60 +1280,60 @@ define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -3253,60 +3253,60 @@ define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index fe3dd7ddc4174..c9860dbb7d72c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -2421,65 +2421,65 @@ define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -5616,65 +5616,65 @@ define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -8229,65 +8229,65 @@ define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -10289,65 +10289,65 @@ define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 25dd5c4e9499f..eaf314d4b65dc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -2674,75 +2674,75 @@ define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s29, s41, s29 -; VI-NEXT: s_or_b32 s28, s40, s28 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -6218,75 +6218,75 @@ define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s29, s41, s29 -; VI-NEXT: s_or_b32 s28, s40, s28 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -9145,75 +9145,75 @@ define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s29, s41, s29 -; VI-NEXT: s_or_b32 s28, s40, s28 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -11471,75 +11471,75 @@ define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s18, 3 -; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s19, 3 -; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s20, 3 -; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s21, 3 -; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s22, 3 -; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s23, 3 -; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s24, 3 -; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s25, 3 -; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s26, 3 -; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s29, s41, s29 -; VI-NEXT: s_or_b32 s28, s40, s28 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s26, 0x30000 -; VI-NEXT: s_add_i32 s26, s24, 0x30000 -; VI-NEXT: s_add_i32 s25, s22, 0x30000 -; VI-NEXT: s_add_i32 s24, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s18, 0x30000 -; VI-NEXT: s_add_i32 s22, s16, 0x30000 -; VI-NEXT: s_add_i32 s21, s14, 0x30000 -; VI-NEXT: s_add_i32 s20, s12, 0x30000 -; VI-NEXT: s_add_i32 s19, s10, 0x30000 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 7eaf481167b99..65fde2fd5e190 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2873,85 +2873,85 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 -; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -14346,124 +14346,142 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v39, v2 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 -; VI-NEXT: v_mov_b32_e32 v48, v14 -; VI-NEXT: v_mov_b32_e32 v49, v13 -; VI-NEXT: v_mov_b32_e32 v50, v12 -; VI-NEXT: v_mov_b32_e32 v33, v10 -; VI-NEXT: v_mov_b32_e32 v32, v9 -; VI-NEXT: v_mov_b32_e32 v31, v8 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v23, v13 -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -14493,192 +14511,156 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 -; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_or_b32_e32 v7, v7, v16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v8, v8, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v9, v9, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 -; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 -; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v10, v10, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 -; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 -; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v11, v11, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 -; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 -; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v12, v12, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 -; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 -; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v13, v13, v17 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v6, v6, v21 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14699,16 +14681,43 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB27_2 ; @@ -18125,85 +18134,85 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB39_3 ; VI-NEXT: .LBB39_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 -; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB39_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -29696,124 +29705,142 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v39, v2 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 -; VI-NEXT: v_mov_b32_e32 v48, v14 -; VI-NEXT: v_mov_b32_e32 v49, v13 -; VI-NEXT: v_mov_b32_e32 v50, v12 -; VI-NEXT: v_mov_b32_e32 v33, v10 -; VI-NEXT: v_mov_b32_e32 v32, v9 -; VI-NEXT: v_mov_b32_e32 v31, v8 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v23, v13 -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -29843,192 +29870,156 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 -; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_or_b32_e32 v7, v7, v16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v8, v8, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v9, v9, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 -; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 -; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v10, v10, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 -; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 -; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v11, v11, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 -; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 -; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v12, v12, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 -; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 -; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v13, v13, v17 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v6, v6, v21 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -30049,16 +30040,43 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB51_2 ; @@ -32826,85 +32844,85 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_3 ; VI-NEXT: .LBB59_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 -; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB59_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -44317,124 +44335,142 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v39, v2 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 -; VI-NEXT: v_mov_b32_e32 v48, v14 -; VI-NEXT: v_mov_b32_e32 v49, v13 -; VI-NEXT: v_mov_b32_e32 v50, v12 -; VI-NEXT: v_mov_b32_e32 v33, v10 -; VI-NEXT: v_mov_b32_e32 v32, v9 -; VI-NEXT: v_mov_b32_e32 v31, v8 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v23, v13 -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -44464,192 +44500,156 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 -; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_or_b32_e32 v7, v7, v16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v8, v8, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v9, v9, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 -; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 -; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v10, v10, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 -; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 -; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v11, v11, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 -; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 -; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v12, v12, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 -; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 -; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v13, v13, v17 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v6, v6, v21 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB71_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -44670,16 +44670,43 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB71_2 ; @@ -46805,85 +46832,85 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB75_3 ; VI-NEXT: .LBB75_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 -; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB75_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -58092,124 +58119,142 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v39, v2 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 -; VI-NEXT: v_mov_b32_e32 v48, v14 -; VI-NEXT: v_mov_b32_e32 v49, v13 -; VI-NEXT: v_mov_b32_e32 v50, v12 -; VI-NEXT: v_mov_b32_e32 v33, v10 -; VI-NEXT: v_mov_b32_e32 v32, v9 -; VI-NEXT: v_mov_b32_e32 v31, v8 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v23, v13 -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -58239,192 +58284,156 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 -; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 -; VI-NEXT: v_or_b32_e32 v7, v7, v16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v8, v8, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v9, v9, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 -; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 -; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v10, v10, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 -; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 -; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v11, v11, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 -; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 -; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v12, v12, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 -; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 -; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v13, v13, v17 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s17, 8 -; VI-NEXT: s_and_b32 s10, s16, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s10, s18, 0xff -; VI-NEXT: s_lshl_b32 s8, s19, 24 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s20, 0xff -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s22, 0xff -; VI-NEXT: s_lshl_b32 s4, s29, 8 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s9 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 -; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: v_or_b32_e32 v3, s4, v3 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v6, v6, v21 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB87_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -58445,16 +58454,43 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: -; VI-NEXT: v_mov_b32_e32 v19, v52 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_mov_b32_e32 v52, v53 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB87_2 ; @@ -67442,84 +67478,84 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_and_b32 s6, s4, 0xffff0000 -; VI-NEXT: s_add_i32 s4, s4, 3 -; VI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s16, 3 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s19, 3 +; VI-NEXT: s_add_i32 s16, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s18, 3 +; VI-NEXT: s_add_i32 s19, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s21, 3 +; VI-NEXT: s_add_i32 s18, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s20, 3 +; VI-NEXT: s_add_i32 s21, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s23, 3 +; VI-NEXT: s_add_i32 s20, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s22, 3 +; VI-NEXT: s_add_i32 s23, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s25, 3 +; VI-NEXT: s_add_i32 s22, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s24, 3 +; VI-NEXT: s_add_i32 s25, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s27, 3 +; VI-NEXT: s_add_i32 s24, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s26, 3 +; VI-NEXT: s_add_i32 s27, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s29, 3 +; VI-NEXT: s_add_i32 s26, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s28, 3 +; VI-NEXT: s_add_i32 s29, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s28, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s5, 0xffff0000 ; VI-NEXT: s_add_i32 s5, s5, 3 -; VI-NEXT: s_and_b32 s8, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s28, 3 -; VI-NEXT: s_and_b32 s10, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s29, 3 -; VI-NEXT: s_and_b32 s12, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s26, 3 -; VI-NEXT: s_and_b32 s14, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s27, 3 -; VI-NEXT: s_and_b32 s26, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s27, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s28, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 s29, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s40, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s41, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 s42, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s43, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s44, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s45, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s4, 0xffff0000 +; VI-NEXT: s_add_i32 s4, s4, 3 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s17, s45, s17 -; VI-NEXT: s_or_b32 s16, s44, s16 -; VI-NEXT: s_or_b32 s19, s43, s19 -; VI-NEXT: s_or_b32 s18, s42, s18 -; VI-NEXT: s_or_b32 s21, s41, s21 -; VI-NEXT: s_or_b32 s20, s40, s20 -; VI-NEXT: s_or_b32 s23, s29, s23 -; VI-NEXT: s_or_b32 s22, s28, s22 -; VI-NEXT: s_or_b32 s25, s27, s25 -; VI-NEXT: s_or_b32 s24, s26, s24 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s5, s7, s5 ; VI-NEXT: s_or_b32 s4, s6, s4 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: s_add_i32 s27, s14, 0x30000 -; VI-NEXT: s_add_i32 s26, s12, 0x30000 -; VI-NEXT: s_add_i32 s29, s10, 0x30000 -; VI-NEXT: s_add_i32 s28, s8, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 @@ -72110,145 +72146,135 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, v20 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v32, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v27 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v51, v23 +; VI-NEXT: v_mov_b32_e32 v30, v26 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v26, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v13 -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v53, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v19, v13 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v19, v57 -; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: v_mov_b32_e32 v39, v14 ; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v21, v15 ; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v20, v5 ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v42, v17 -; VI-NEXT: v_mov_b32_e32 v35, v16 -; VI-NEXT: v_mov_b32_e32 v41, v21 -; VI-NEXT: v_mov_b32_e32 v38, v20 -; VI-NEXT: v_mov_b32_e32 v39, v25 -; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v31, v28 -; VI-NEXT: v_mov_b32_e32 v23, v50 -; VI-NEXT: v_mov_b32_e32 v50, v30 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v59, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v58 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -72276,214 +72302,153 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v19 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v50 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v32 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v56 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v43 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v38 -; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v41 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v31 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v39 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v52 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v61 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v48 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v51 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v38, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v40 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v53 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v19 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v60 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v59 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 -; VI-NEXT: v_or_b32_sdwa v48, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v37, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v19, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v30 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v26, v19, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v22, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_or_b32_e32 v3, v3, v13 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v22 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v23, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v27, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v23 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v27 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_e32 v4, v4, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; VI-NEXT: v_or_b32_e32 v5, v5, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; VI-NEXT: v_or_b32_e32 v6, v6, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; VI-NEXT: v_or_b32_e32 v8, v8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; VI-NEXT: v_or_b32_e32 v9, v9, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; VI-NEXT: v_or_b32_e32 v10, v10, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; VI-NEXT: v_or_b32_e32 v11, v11, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; VI-NEXT: v_or_b32_e32 v2, v2, v13 -; VI-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 -; VI-NEXT: v_or_b32_sdwa v5, v5, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 -; VI-NEXT: v_or_b32_sdwa v7, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: .LBB99_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -72504,24 +72469,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB99_4: -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v23, v50 -; VI-NEXT: v_mov_b32_e32 v19, v57 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v36, v13 -; VI-NEXT: v_mov_b32_e32 v42, v17 -; VI-NEXT: v_mov_b32_e32 v41, v21 -; VI-NEXT: v_mov_b32_e32 v39, v25 -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v35, v16 -; VI-NEXT: v_mov_b32_e32 v38, v20 -; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v31, v28 -; VI-NEXT: v_mov_b32_e32 v50, v30 -; VI-NEXT: v_mov_b32_e32 v49, v55 -; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_mov_b32_e32 v20, v5 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB99_2 ; @@ -85219,145 +85182,135 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, v20 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v32, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v27 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v51, v23 +; VI-NEXT: v_mov_b32_e32 v30, v26 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v26, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v13 -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v53, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v19, v13 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v19, v57 -; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: v_mov_b32_e32 v39, v14 ; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v21, v15 ; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v20, v5 ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v42, v17 -; VI-NEXT: v_mov_b32_e32 v35, v16 -; VI-NEXT: v_mov_b32_e32 v41, v21 -; VI-NEXT: v_mov_b32_e32 v38, v20 -; VI-NEXT: v_mov_b32_e32 v39, v25 -; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v31, v28 -; VI-NEXT: v_mov_b32_e32 v23, v50 -; VI-NEXT: v_mov_b32_e32 v50, v30 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v59, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v58 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -85385,214 +85338,153 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v19 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v50 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v32 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v56 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v43 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v38 -; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v41 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v31 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v39 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v52 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v61 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v48 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v51 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v38, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v40 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v53 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v19 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v60 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v59 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 -; VI-NEXT: v_or_b32_sdwa v48, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v37, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v19, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v30 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v26, v19, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v22, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_or_b32_e32 v3, v3, v13 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v22 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v23, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v27, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v23 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v27 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_e32 v4, v4, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; VI-NEXT: v_or_b32_e32 v5, v5, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; VI-NEXT: v_or_b32_e32 v6, v6, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; VI-NEXT: v_or_b32_e32 v8, v8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; VI-NEXT: v_or_b32_e32 v9, v9, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; VI-NEXT: v_or_b32_e32 v10, v10, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; VI-NEXT: v_or_b32_e32 v11, v11, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; VI-NEXT: v_or_b32_e32 v2, v2, v13 -; VI-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 -; VI-NEXT: v_or_b32_sdwa v5, v5, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 -; VI-NEXT: v_or_b32_sdwa v7, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: .LBB107_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -85613,24 +85505,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB107_4: -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v23, v50 -; VI-NEXT: v_mov_b32_e32 v19, v57 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v36, v13 -; VI-NEXT: v_mov_b32_e32 v42, v17 -; VI-NEXT: v_mov_b32_e32 v41, v21 -; VI-NEXT: v_mov_b32_e32 v39, v25 -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v35, v16 -; VI-NEXT: v_mov_b32_e32 v38, v20 -; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v31, v28 -; VI-NEXT: v_mov_b32_e32 v50, v30 -; VI-NEXT: v_mov_b32_e32 v49, v55 -; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_mov_b32_e32 v20, v5 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB107_2 ; @@ -96665,145 +96555,135 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, v20 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v32, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v27 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v51, v23 +; VI-NEXT: v_mov_b32_e32 v30, v26 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v10, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v14, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v26, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v13 -; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v53, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v19, v13 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v19, v57 -; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: v_mov_b32_e32 v39, v14 ; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v21, v15 ; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v20, v5 ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v42, v17 -; VI-NEXT: v_mov_b32_e32 v35, v16 -; VI-NEXT: v_mov_b32_e32 v41, v21 -; VI-NEXT: v_mov_b32_e32 v38, v20 -; VI-NEXT: v_mov_b32_e32 v39, v25 -; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v31, v28 -; VI-NEXT: v_mov_b32_e32 v23, v50 -; VI-NEXT: v_mov_b32_e32 v50, v30 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v59, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v58 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -96831,214 +96711,153 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v19 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v37 -; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v50 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v32 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v56 -; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v43 -; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v38 -; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v34 -; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v41 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v31 -; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v39 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s10, s29, 8 -; VI-NEXT: s_and_b32 s11, s28, 0xff -; VI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v52 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_and_b32 s11, s24, 0xff -; VI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v61 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 -; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v48 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s11, s20, 0xff -; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v51 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: v_or_b32_sdwa v38, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_or_b32 s8, s8, s11 -; VI-NEXT: s_and_b32 s11, s16, 0xff -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v40 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: s_or_b32 s7, s7, s11 -; VI-NEXT: s_and_b32 s13, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 24 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v53 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_and_b32 s12, s22, 0xff -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v19 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v60 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 -; VI-NEXT: s_and_b32 s11, s26, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s8, 0xffff -; VI-NEXT: s_lshl_b32 s8, s12, 16 -; VI-NEXT: s_lshl_b32 s4, s27, 24 -; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v59 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s11, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 -; VI-NEXT: v_or_b32_sdwa v48, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s10, 0xffff -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v37, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v19, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v30 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v26, v19, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v22, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; VI-NEXT: v_or_b32_e32 v3, v3, v13 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v22 -; VI-NEXT: v_or_b32_e32 v3, s7, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v23, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v27, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v23 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v27 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_e32 v4, v4, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; VI-NEXT: v_or_b32_e32 v5, v5, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; VI-NEXT: v_or_b32_e32 v6, v6, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; VI-NEXT: v_or_b32_e32 v8, v8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; VI-NEXT: v_or_b32_e32 v9, v9, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; VI-NEXT: v_or_b32_e32 v10, v10, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; VI-NEXT: v_or_b32_e32 v11, v11, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; VI-NEXT: v_or_b32_e32 v2, v2, v13 -; VI-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 -; VI-NEXT: v_or_b32_sdwa v5, v5, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 -; VI-NEXT: v_or_b32_sdwa v7, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: .LBB111_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -97059,24 +96878,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB111_4: -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v23, v50 -; VI-NEXT: v_mov_b32_e32 v19, v57 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v36, v13 -; VI-NEXT: v_mov_b32_e32 v42, v17 -; VI-NEXT: v_mov_b32_e32 v41, v21 -; VI-NEXT: v_mov_b32_e32 v39, v25 -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v35, v16 -; VI-NEXT: v_mov_b32_e32 v38, v20 -; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v31, v28 -; VI-NEXT: v_mov_b32_e32 v50, v30 -; VI-NEXT: v_mov_b32_e32 v49, v55 -; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_mov_b32_e32 v20, v5 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB111_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 5624a08cd89fc..dda05a8897979 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -4435,77 +4435,77 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v36i16_to_v18i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; VI-NEXT: v_mov_b32_e32 v32, v3 ; VI-NEXT: v_mov_b32_e32 v33, v2 -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -4525,106 +4525,106 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -12080,77 +12080,77 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-LABEL: bitcast_v36i16_to_v18f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; VI-NEXT: v_mov_b32_e32 v32, v3 ; VI-NEXT: v_mov_b32_e32 v33, v2 -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -12170,106 +12170,106 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -18943,77 +18943,77 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-LABEL: bitcast_v36i16_to_v9i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; VI-NEXT: v_mov_b32_e32 v32, v3 ; VI-NEXT: v_mov_b32_e32 v33, v2 -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -19033,106 +19033,106 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -25028,77 +25028,77 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-LABEL: bitcast_v36i16_to_v9f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; VI-NEXT: v_mov_b32_e32 v32, v3 ; VI-NEXT: v_mov_b32_e32 v33, v2 -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -25118,106 +25118,106 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -29967,42 +29967,42 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 64b6ca9e6117e..0ac06bbd1b996 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -4725,85 +4725,85 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v40i16_to_v20i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v34, v3 ; VI-NEXT: v_mov_b32_e32 v35, v2 -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -4821,114 +4821,114 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -12827,85 +12827,85 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; VI-LABEL: bitcast_v40i16_to_v20f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v34, v3 ; VI-NEXT: v_mov_b32_e32 v35, v2 -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -12923,114 +12923,114 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -20241,85 +20241,85 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v40i16_to_v10i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v34, v3 ; VI-NEXT: v_mov_b32_e32 v35, v2 -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -20337,114 +20337,114 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -26882,85 +26882,85 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; VI-LABEL: bitcast_v40i16_to_v10f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v34, v3 ; VI-NEXT: v_mov_b32_e32 v35, v2 -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_lshl_b32 s59, s11, 16 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -26978,114 +26978,114 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -32311,46 +32311,46 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index abb312899114e..a6e041b2d8300 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -1300,15 +1300,15 @@ define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -3440,29 +3440,29 @@ define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_and_b32 s8, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: .LBB27_3: ; %end @@ -4575,15 +4575,15 @@ define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB39_3 ; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB39_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -6701,29 +6701,29 @@ define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_and_b32 s8, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: .LBB51_3: ; %end @@ -7522,15 +7522,15 @@ define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 in ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_3 ; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB59_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -9658,29 +9658,29 @@ define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_and_b32 s8, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: .LBB71_3: ; %end @@ -10156,15 +10156,15 @@ define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB75_3 ; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB75_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -12312,29 +12312,29 @@ define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 in ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_and_b32 s8, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: .LBB87_3: ; %end @@ -13984,46 +13984,48 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; VI-NEXT: s_cbranch_scc0 .LBB97_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 -; VI-NEXT: s_lshr_b32 s9, s17, 24 +; VI-NEXT: s_lshr_b32 s10, s17, 24 ; VI-NEXT: s_lshr_b32 s8, s17, 16 ; VI-NEXT: s_lshr_b32 s5, s17, 8 -; VI-NEXT: s_lshr_b32 s10, s16, 16 -; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_lshr_b32 s11, s16, 16 +; VI-NEXT: s_lshr_b32 s12, s16, 8 +; VI-NEXT: s_mov_b32 s9, s17 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: s_add_i32 s8, s4, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s5, s16, 0xffff0000 +; VI-NEXT: s_lshr_b32 s5, s17, 16 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_add_i32 s8, s5, 3 +; VI-NEXT: s_and_b32 s4, s9, 0xffff +; VI-NEXT: s_lshl_b32 s5, s8, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s17, 0xffff -; VI-NEXT: s_lshl_b32 s6, s8, 16 -; VI-NEXT: s_or_b32 s7, s4, s6 -; VI-NEXT: s_and_b32 s4, s16, 0xffff -; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 ; VI-NEXT: s_lshr_b32 s5, s7, 8 -; VI-NEXT: s_lshr_b32 s10, s6, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 8 -; VI-NEXT: s_bfe_u32 s9, s8, 0x80008 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s6, 8 +; VI-NEXT: s_bfe_u32 s10, s8, 0x80008 ; VI-NEXT: .LBB97_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v6, s8 -; VI-NEXT: v_mov_b32_e32 v7, s9 +; VI-NEXT: v_mov_b32_e32 v7, s10 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: s_branch .LBB97_2 ; ; GFX9-LABEL: bitcast_v4i16_to_v8i8_scalar: @@ -14655,29 +14657,29 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_and_b32 s8, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: .LBB99_3: ; %end @@ -16582,29 +16584,29 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_and_b32 s8, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: .LBB107_3: ; %end @@ -17999,29 +18001,29 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s17, 8 -; VI-NEXT: s_and_b32 s8, s16, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: .LBB111_3: ; %end diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 051c60e59acc6..d8fe5f27e9ac8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -5118,91 +5118,91 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v44i16_to_v22i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 ; VI-NEXT: v_mov_b32_e32 v34, v5 ; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_mov_b32_e32 v36, v3 ; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -5219,123 +5219,123 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -14006,91 +14006,91 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; VI-LABEL: bitcast_v44i16_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 ; VI-NEXT: v_mov_b32_e32 v34, v5 ; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_mov_b32_e32 v36, v3 ; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -14107,123 +14107,123 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -22184,91 +22184,91 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v44i16_to_v11i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 ; VI-NEXT: v_mov_b32_e32 v34, v5 ; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_mov_b32_e32 v36, v3 ; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -22285,123 +22285,123 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -29524,91 +29524,91 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; VI-LABEL: bitcast_v44i16_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 ; VI-NEXT: v_mov_b32_e32 v34, v5 ; VI-NEXT: v_mov_b32_e32 v35, v4 ; VI-NEXT: v_mov_b32_e32 v36, v3 ; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_lshl_b32 s61, s9, 16 ; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: s_lshl_b32 s63, s7, 16 ; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -29625,123 +29625,123 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -35617,50 +35617,50 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index dfd5c09f77b1d..79adc25903ac7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -5560,97 +5560,97 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v48i16_to_v24i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: v_mov_b32_e32 v49, v9 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v35, v7 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v36, v3 -; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -5667,131 +5667,131 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -15349,97 +15349,97 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; VI-LABEL: bitcast_v48i16_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: v_mov_b32_e32 v49, v9 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v35, v7 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v36, v3 -; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -15456,131 +15456,131 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -24397,97 +24397,97 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v48i16_to_v12i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: v_mov_b32_e32 v49, v9 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v35, v7 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v36, v3 -; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -24504,131 +24504,131 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -32568,97 +32568,97 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; VI-LABEL: bitcast_v48i16_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: v_mov_b32_e32 v49, v9 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v35, v7 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v36, v3 -; VI-NEXT: v_mov_b32_e32 v37, v2 -; VI-NEXT: v_mov_b32_e32 v39, v1 -; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 -; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -32675,131 +32675,131 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 -; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -39384,54 +39384,54 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index eac4794012a9f..e19eba6270957 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -5996,103 +5996,103 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v52i16_to_v26i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v39, v11 -; VI-NEXT: v_mov_b32_e32 v37, v10 -; VI-NEXT: v_mov_b32_e32 v35, v9 -; VI-NEXT: v_mov_b32_e32 v34, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 ; VI-NEXT: v_mov_b32_e32 v48, v3 ; VI-NEXT: v_mov_b32_e32 v49, v2 -; VI-NEXT: v_mov_b32_e32 v51, v1 -; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 -; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -6109,139 +6109,139 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -16690,103 +16690,103 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; VI-LABEL: bitcast_v52i16_to_v26f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v39, v11 -; VI-NEXT: v_mov_b32_e32 v37, v10 -; VI-NEXT: v_mov_b32_e32 v35, v9 -; VI-NEXT: v_mov_b32_e32 v34, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 ; VI-NEXT: v_mov_b32_e32 v48, v3 ; VI-NEXT: v_mov_b32_e32 v49, v2 -; VI-NEXT: v_mov_b32_e32 v51, v1 -; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 -; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -16803,139 +16803,139 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -26585,103 +26585,103 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v52i16_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v39, v11 -; VI-NEXT: v_mov_b32_e32 v37, v10 -; VI-NEXT: v_mov_b32_e32 v35, v9 -; VI-NEXT: v_mov_b32_e32 v34, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 ; VI-NEXT: v_mov_b32_e32 v48, v3 ; VI-NEXT: v_mov_b32_e32 v49, v2 -; VI-NEXT: v_mov_b32_e32 v51, v1 -; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 -; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -26698,139 +26698,139 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -35574,103 +35574,103 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; VI-LABEL: bitcast_v52i16_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v39, v11 -; VI-NEXT: v_mov_b32_e32 v37, v10 -; VI-NEXT: v_mov_b32_e32 v35, v9 -; VI-NEXT: v_mov_b32_e32 v34, v8 -; VI-NEXT: v_mov_b32_e32 v33, v7 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v36, v5 -; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 ; VI-NEXT: v_mov_b32_e32 v48, v3 ; VI-NEXT: v_mov_b32_e32 v49, v2 -; VI-NEXT: v_mov_b32_e32 v51, v1 -; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 -; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -35687,139 +35687,139 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -43223,58 +43223,58 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 5ed1db9e65839..66242a3cf45d8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -6450,109 +6450,109 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v56i16_to_v28i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v39, v13 -; VI-NEXT: v_mov_b32_e32 v37, v12 -; VI-NEXT: v_mov_b32_e32 v35, v11 -; VI-NEXT: v_mov_b32_e32 v34, v10 -; VI-NEXT: v_mov_b32_e32 v33, v9 -; VI-NEXT: v_mov_b32_e32 v32, v8 -; VI-NEXT: v_mov_b32_e32 v36, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 ; VI-NEXT: v_mov_b32_e32 v48, v5 ; VI-NEXT: v_mov_b32_e32 v49, v4 ; VI-NEXT: v_mov_b32_e32 v50, v3 ; VI-NEXT: v_mov_b32_e32 v51, v2 -; VI-NEXT: v_mov_b32_e32 v53, v1 -; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 -; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -6569,147 +6569,147 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -18063,109 +18063,109 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; VI-LABEL: bitcast_v56i16_to_v28f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v39, v13 -; VI-NEXT: v_mov_b32_e32 v37, v12 -; VI-NEXT: v_mov_b32_e32 v35, v11 -; VI-NEXT: v_mov_b32_e32 v34, v10 -; VI-NEXT: v_mov_b32_e32 v33, v9 -; VI-NEXT: v_mov_b32_e32 v32, v8 -; VI-NEXT: v_mov_b32_e32 v36, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 ; VI-NEXT: v_mov_b32_e32 v48, v5 ; VI-NEXT: v_mov_b32_e32 v49, v4 ; VI-NEXT: v_mov_b32_e32 v50, v3 ; VI-NEXT: v_mov_b32_e32 v51, v2 -; VI-NEXT: v_mov_b32_e32 v53, v1 -; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 -; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -18182,147 +18182,147 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -28817,109 +28817,109 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v56i16_to_v14i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v39, v13 -; VI-NEXT: v_mov_b32_e32 v37, v12 -; VI-NEXT: v_mov_b32_e32 v35, v11 -; VI-NEXT: v_mov_b32_e32 v34, v10 -; VI-NEXT: v_mov_b32_e32 v33, v9 -; VI-NEXT: v_mov_b32_e32 v32, v8 -; VI-NEXT: v_mov_b32_e32 v36, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 ; VI-NEXT: v_mov_b32_e32 v48, v5 ; VI-NEXT: v_mov_b32_e32 v49, v4 ; VI-NEXT: v_mov_b32_e32 v50, v3 ; VI-NEXT: v_mov_b32_e32 v51, v2 -; VI-NEXT: v_mov_b32_e32 v53, v1 -; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 -; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -28936,147 +28936,147 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -38622,109 +38622,109 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; VI-LABEL: bitcast_v56i16_to_v14f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v39, v13 -; VI-NEXT: v_mov_b32_e32 v37, v12 -; VI-NEXT: v_mov_b32_e32 v35, v11 -; VI-NEXT: v_mov_b32_e32 v34, v10 -; VI-NEXT: v_mov_b32_e32 v33, v9 -; VI-NEXT: v_mov_b32_e32 v32, v8 -; VI-NEXT: v_mov_b32_e32 v36, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 ; VI-NEXT: v_mov_b32_e32 v48, v5 ; VI-NEXT: v_mov_b32_e32 v49, v4 ; VI-NEXT: v_mov_b32_e32 v50, v3 ; VI-NEXT: v_mov_b32_e32 v51, v2 -; VI-NEXT: v_mov_b32_e32 v53, v1 -; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 -; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -38741,147 +38741,147 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff -; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 -; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -47023,62 +47023,62 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 97e880e1bf488..b480e89dfcc30 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -6869,115 +6869,115 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v39, v15 -; VI-NEXT: v_mov_b32_e32 v37, v14 -; VI-NEXT: v_mov_b32_e32 v35, v13 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v33, v11 -; VI-NEXT: v_mov_b32_e32 v32, v10 -; VI-NEXT: v_mov_b32_e32 v36, v9 -; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 ; VI-NEXT: v_mov_b32_e32 v48, v7 ; VI-NEXT: v_mov_b32_e32 v49, v6 ; VI-NEXT: v_mov_b32_e32 v50, v5 ; VI-NEXT: v_mov_b32_e32 v51, v4 ; VI-NEXT: v_mov_b32_e32 v52, v3 ; VI-NEXT: v_mov_b32_e32 v53, v2 -; VI-NEXT: v_mov_b32_e32 v55, v1 -; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 -; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -6995,154 +6995,154 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: @@ -19425,115 +19425,115 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v39, v15 -; VI-NEXT: v_mov_b32_e32 v37, v14 -; VI-NEXT: v_mov_b32_e32 v35, v13 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v33, v11 -; VI-NEXT: v_mov_b32_e32 v32, v10 -; VI-NEXT: v_mov_b32_e32 v36, v9 -; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 ; VI-NEXT: v_mov_b32_e32 v48, v7 ; VI-NEXT: v_mov_b32_e32 v49, v6 ; VI-NEXT: v_mov_b32_e32 v50, v5 ; VI-NEXT: v_mov_b32_e32 v51, v4 ; VI-NEXT: v_mov_b32_e32 v52, v3 ; VI-NEXT: v_mov_b32_e32 v53, v2 -; VI-NEXT: v_mov_b32_e32 v55, v1 -; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 -; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -19551,154 +19551,154 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB31_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: @@ -31083,115 +31083,115 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v39, v15 -; VI-NEXT: v_mov_b32_e32 v37, v14 -; VI-NEXT: v_mov_b32_e32 v35, v13 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v33, v11 -; VI-NEXT: v_mov_b32_e32 v32, v10 -; VI-NEXT: v_mov_b32_e32 v36, v9 -; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 ; VI-NEXT: v_mov_b32_e32 v48, v7 ; VI-NEXT: v_mov_b32_e32 v49, v6 ; VI-NEXT: v_mov_b32_e32 v50, v5 ; VI-NEXT: v_mov_b32_e32 v51, v4 ; VI-NEXT: v_mov_b32_e32 v52, v3 ; VI-NEXT: v_mov_b32_e32 v53, v2 -; VI-NEXT: v_mov_b32_e32 v55, v1 -; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 -; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -31209,154 +31209,154 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: @@ -41743,115 +41743,115 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s43, s29, 16 -; VI-NEXT: s_lshr_b32 s42, s28, 16 -; VI-NEXT: s_lshr_b32 s41, s27, 16 -; VI-NEXT: s_lshr_b32 s40, s26, 16 -; VI-NEXT: s_lshr_b32 s15, s25, 16 -; VI-NEXT: s_lshr_b32 s14, s24, 16 -; VI-NEXT: s_lshr_b32 s13, s23, 16 -; VI-NEXT: s_lshr_b32 s12, s22, 16 -; VI-NEXT: s_lshr_b32 s11, s21, 16 -; VI-NEXT: s_lshr_b32 s10, s20, 16 -; VI-NEXT: s_lshr_b32 s9, s19, 16 -; VI-NEXT: s_lshr_b32 s8, s18, 16 -; VI-NEXT: s_lshr_b32 s7, s17, 16 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v39, v15 -; VI-NEXT: v_mov_b32_e32 v37, v14 -; VI-NEXT: v_mov_b32_e32 v35, v13 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v33, v11 -; VI-NEXT: v_mov_b32_e32 v32, v10 -; VI-NEXT: v_mov_b32_e32 v36, v9 -; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 ; VI-NEXT: v_mov_b32_e32 v48, v7 ; VI-NEXT: v_mov_b32_e32 v49, v6 ; VI-NEXT: v_mov_b32_e32 v50, v5 ; VI-NEXT: v_mov_b32_e32 v51, v4 ; VI-NEXT: v_mov_b32_e32 v52, v3 ; VI-NEXT: v_mov_b32_e32 v53, v2 -; VI-NEXT: v_mov_b32_e32 v55, v1 -; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 -; VI-NEXT: s_lshl_b32 s5, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s44, s7, 16 -; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_lshl_b32 s45, s41, 16 ; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_lshl_b32 s47, s15, 16 ; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_lshl_b32 s57, s13, 16 ; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_lshl_b32 s58, s12, 16 ; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s59, s14, 16 -; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s60, s15, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s61, s40, 16 -; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s62, s41, 16 -; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s63, s42, 16 -; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s72, s43, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s44 @@ -41869,154 +41869,154 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xffff ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s9, s9, s16 -; VI-NEXT: s_and_b32 s16, s20, 0xffff -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s11, s11, 16 -; VI-NEXT: s_or_b32 s10, s10, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xffff -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s11, s11, s16 -; VI-NEXT: s_and_b32 s16, s22, 0xffff -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s13, s13, 16 -; VI-NEXT: s_or_b32 s12, s12, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s13, s13, s16 -; VI-NEXT: s_and_b32 s16, s24, 0xffff -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_lshl_b32 s5, s42, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 -; VI-NEXT: s_lshl_b32 s4, s43, 16 -; VI-NEXT: s_lshl_b32 s41, s41, 16 -; VI-NEXT: s_lshl_b32 s40, s40, 16 -; VI-NEXT: s_or_b32 s15, s15, s16 -; VI-NEXT: s_and_b32 s16, s26, 0xffff -; VI-NEXT: s_and_b32 s17, s27, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: s_or_b32 s16, s40, s16 -; VI-NEXT: s_or_b32 s17, s41, s17 -; VI-NEXT: s_or_b32 s4, s4, s18 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 ; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_mov_b32_e32 v7, s13 -; VI-NEXT: v_mov_b32_e32 v8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_mov_b32_e32 v11, s17 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: @@ -50943,66 +50943,66 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 8d945ea75e761..f888f4f3b1407 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1489,40 +1489,40 @@ define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 in ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s8, s17, 8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_and_b32 s8, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s27, 24 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 @@ -3461,20 +3461,20 @@ define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 in ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB19_3 ; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB19_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -4694,40 +4694,40 @@ define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB23_3 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s8, s17, 8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_and_b32 s8, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s27, 24 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 @@ -6675,20 +6675,20 @@ define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB35_3 ; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s18, s8, 0x30000 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: .LBB35_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -7353,40 +7353,40 @@ define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB37_3 ; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s8, s17, 8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_and_b32 s8, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s27, 24 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 @@ -9226,40 +9226,40 @@ define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 i ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s8, s17, 8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_and_b32 s8, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s27, 24 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 @@ -10724,40 +10724,40 @@ define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 in ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s8, s17, 8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s9, s18, 0xff -; VI-NEXT: s_lshl_b32 s4, s19, 24 -; VI-NEXT: s_addk_i32 s8, 0x300 -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s21, 8 -; VI-NEXT: s_or_b32 s4, s4, s8 -; VI-NEXT: s_and_b32 s8, s20, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s22, 0xff -; VI-NEXT: s_lshl_b32 s5, s23, 24 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_lshl_b32 s7, s27, 24 -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 @@ -11363,20 +11363,20 @@ define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 in ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB47_3 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_add_i32 s7, s16, 3 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_add_i32 s16, s6, 0x30000 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 ; VI-NEXT: s_add_i32 s18, s4, 0x30000 ; VI-NEXT: s_lshr_b32 s19, s16, 8 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 51afa79674a80..338dd9dedd37e 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -27,11 +27,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -42,11 +40,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll index 155042c5fc3c3..8ed8d905c5512 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -21,21 +21,20 @@ define amdgpu_kernel void @s_ashr_v2i16(ptr addrspace(1) %out, i32, <2 x i16> %l ; ; VI-LABEL: s_ashr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x38 +; VI-NEXT: s_load_dword s7, s[4:5], 0x30 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_load_dword s4, s[4:5], 0x38 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s6, 16 -; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_ashr_i32 s7, s4, 16 -; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_ashr_i32 s5, s5, s7 -; VI-NEXT: s_ashr_i32 s4, s6, s4 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_ashr_i32 s5, s7, 16 +; VI-NEXT: s_ashr_i32 s4, s5, s4 +; VI-NEXT: s_sext_i32_i16 s5, s7 +; VI-NEXT: s_ashr_i32 s5, s5, s6 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index a9358dc4a51d8..ab078be5c13a3 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -52,7 +52,6 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -80,11 +79,9 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_brev_b32 s2, s2 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-GISEL-TRUE16-NEXT: s_endpgm @@ -96,11 +93,9 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_brev_b32 s2, s2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-GISEL-FAKE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index bc8e21e03251d..a1aef8ddf6bba 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -1004,7 +1004,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; ; VI-LABEL: ps_mesa_inreg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_and_b32 s0, 0xffff, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_short v[0:1], v0 @@ -1012,9 +1011,8 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; ; GFX11-LABEL: ps_mesa_inreg_i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_endpgm @@ -1156,20 +1154,20 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; VI-NEXT: s_lshr_b32 s1, s0, 24 -; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_bfe_u32 s3, s0, 0x80008 +; VI-NEXT: s_add_i32 s2, s2, s2 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 8 +; VI-NEXT: s_and_b32 s1, s1, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 24 -; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: s_or_b32 s1, s1, s2 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s2, s3, 8 ; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s1, s1, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1240,8 +1238,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: s_bfe_u32 s2, s0, 0x80008 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v2, s1 @@ -1318,22 +1316,21 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v0, 4 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_lshr_b32 s2, s0, 24 -; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 24 ; VI-NEXT: s_bfe_u32 s4, s0, 0x80008 +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_add_i32 s4, s4, s4 +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s2, s2, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s3, s4, 8 -; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_add_i32 s1, s1, s1 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: s_or_b32 s0, s0, s2 @@ -1430,37 +1427,37 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s1, 16 -; VI-NEXT: s_lshr_b32 s2, s1, 24 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 24 +; VI-NEXT: s_lshr_b32 s4, s1, 16 +; VI-NEXT: s_lshr_b32 s5, s1, 24 +; VI-NEXT: s_bfe_u32 s6, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s7, s1, 0x80008 +; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_bfe_u32 s6, s1, 0x80008 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_lshl_b32 s2, s2, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s3, s6, 8 -; VI-NEXT: s_lshr_b32 s4, s0, 24 -; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_or_b32 s1, s1, s3 -; VI-NEXT: s_bfe_u32 s7, s0, 0x80008 -; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s3, s5, 0xff ; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_lshl_b32 s2, s4, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s5, s7, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s3, s7, 8 +; VI-NEXT: s_lshl_b32 s3, s6, 8 +; VI-NEXT: s_or_b32 s1, s1, s5 ; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s1, s1, s4 ; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1599,69 +1596,69 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_lshr_b32 s4, s3, 24 +; VI-NEXT: s_lshr_b32 s4, s0, 16 +; VI-NEXT: s_lshr_b32 s5, s0, 24 +; VI-NEXT: s_lshr_b32 s6, s1, 16 +; VI-NEXT: s_lshr_b32 s7, s1, 24 +; VI-NEXT: s_lshr_b32 s8, s2, 16 +; VI-NEXT: s_lshr_b32 s9, s2, 24 +; VI-NEXT: s_lshr_b32 s10, s3, 16 +; VI-NEXT: s_lshr_b32 s11, s3, 24 +; VI-NEXT: s_bfe_u32 s12, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s13, s1, 0x80008 +; VI-NEXT: s_bfe_u32 s14, s2, 0x80008 +; VI-NEXT: s_bfe_u32 s15, s3, 0x80008 +; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_add_i32 s10, s10, s10 +; VI-NEXT: s_add_i32 s9, s9, s9 +; VI-NEXT: s_add_i32 s8, s8, s8 +; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_add_i32 s6, s6, s6 ; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_bfe_u32 s12, s3, 0x80008 ; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_lshl_b32 s4, s4, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s2, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_lshl_b32 s5, s12, 8 -; VI-NEXT: s_lshr_b32 s6, s2, 24 -; VI-NEXT: s_add_i32 s7, s7, s7 -; VI-NEXT: s_or_b32 s3, s3, s5 -; VI-NEXT: s_bfe_u32 s13, s2, 0x80008 -; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s5, s7, 0xff -; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_or_b32 s3, s3, s4 -; VI-NEXT: s_lshl_b32 s4, s6, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s9, s1, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s5, s13, 8 -; VI-NEXT: s_lshr_b32 s8, s1, 24 -; VI-NEXT: s_add_i32 s9, s9, s9 -; VI-NEXT: s_or_b32 s2, s2, s5 -; VI-NEXT: s_bfe_u32 s14, s1, 0x80008 -; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s5, s9, 0xff ; VI-NEXT: s_add_i32 s14, s14, s14 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: s_lshl_b32 s4, s8, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s11, s0, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s5, s14, 8 -; VI-NEXT: s_lshr_b32 s10, s0, 24 -; VI-NEXT: s_add_i32 s11, s11, s11 -; VI-NEXT: s_or_b32 s1, s1, s5 -; VI-NEXT: s_bfe_u32 s15, s0, 0x80008 -; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s5, s11, 0xff -; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s4 -; VI-NEXT: s_lshl_b32 s4, s10, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s11, s15, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s9, s14, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s7, s13, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_lshl_b32 s5, s12, 8 +; VI-NEXT: s_or_b32 s3, s3, s11 +; VI-NEXT: s_or_b32 s2, s2, s9 +; VI-NEXT: s_or_b32 s1, s1, s7 ; VI-NEXT: s_or_b32 s0, s0, s5 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s10 +; VI-NEXT: s_or_b32 s2, s2, s8 +; VI-NEXT: s_or_b32 s1, s1, s6 ; VI-NEXT: s_or_b32 s0, s0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1905,138 +1902,138 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v4, 16 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s9, s3, 16 -; VI-NEXT: s_lshr_b32 s8, s3, 24 +; VI-NEXT: s_lshr_b32 s8, s4, 16 +; VI-NEXT: s_lshr_b32 s9, s4, 24 +; VI-NEXT: s_lshr_b32 s10, s5, 16 +; VI-NEXT: s_lshr_b32 s11, s5, 24 +; VI-NEXT: s_lshr_b32 s12, s6, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 24 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s7, 24 +; VI-NEXT: s_bfe_u32 s24, s4, 0x80008 +; VI-NEXT: s_bfe_u32 s25, s5, 0x80008 +; VI-NEXT: s_bfe_u32 s26, s6, 0x80008 +; VI-NEXT: s_bfe_u32 s27, s7, 0x80008 +; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_add_i32 s14, s14, s14 +; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_add_i32 s10, s10, s10 ; VI-NEXT: s_add_i32 s9, s9, s9 -; VI-NEXT: s_bfe_u32 s24, s3, 0x80008 ; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshr_b32 s16, s0, 16 +; VI-NEXT: s_lshr_b32 s17, s0, 24 +; VI-NEXT: s_lshr_b32 s18, s1, 16 +; VI-NEXT: s_lshr_b32 s19, s1, 24 +; VI-NEXT: s_lshr_b32 s20, s2, 16 +; VI-NEXT: s_lshr_b32 s21, s2, 24 +; VI-NEXT: s_lshr_b32 s22, s3, 16 +; VI-NEXT: s_lshr_b32 s23, s3, 24 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_add_i32 s27, s27, s27 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_add_i32 s26, s26, s26 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_add_i32 s25, s25, s25 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_add_i32 s24, s24, s24 -; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_lshl_b32 s8, s8, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s11, s2, 16 +; VI-NEXT: s_bfe_u32 s28, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s29, s1, 0x80008 +; VI-NEXT: s_bfe_u32 s30, s2, 0x80008 +; VI-NEXT: s_bfe_u32 s31, s3, 0x80008 +; VI-NEXT: s_add_i32 s23, s23, s23 +; VI-NEXT: s_add_i32 s22, s22, s22 +; VI-NEXT: s_add_i32 s21, s21, s21 +; VI-NEXT: s_add_i32 s20, s20, s20 +; VI-NEXT: s_add_i32 s19, s19, s19 +; VI-NEXT: s_add_i32 s18, s18, s18 +; VI-NEXT: s_add_i32 s17, s17, s17 +; VI-NEXT: s_add_i32 s16, s16, s16 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s15, s27, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s11, s25, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s9, s24, 8 -; VI-NEXT: s_lshr_b32 s10, s2, 24 -; VI-NEXT: s_add_i32 s11, s11, s11 -; VI-NEXT: s_or_b32 s3, s3, s9 -; VI-NEXT: s_bfe_u32 s25, s2, 0x80008 -; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s9, s11, 0xff -; VI-NEXT: s_add_i32 s25, s25, s25 +; VI-NEXT: s_lshl_b32 s23, s23, 8 +; VI-NEXT: s_and_b32 s22, s22, 0xff +; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_add_i32 s31, s31, s31 +; VI-NEXT: s_lshl_b32 s21, s21, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_or_b32 s3, s3, s8 -; VI-NEXT: s_lshl_b32 s8, s10, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s13, s1, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_lshr_b32 s12, s1, 24 -; VI-NEXT: s_add_i32 s13, s13, s13 -; VI-NEXT: s_or_b32 s2, s2, s9 -; VI-NEXT: s_bfe_u32 s26, s1, 0x80008 -; VI-NEXT: s_add_i32 s12, s12, s12 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s9, s13, 0xff -; VI-NEXT: s_add_i32 s26, s26, s26 +; VI-NEXT: s_add_i32 s30, s30, s30 +; VI-NEXT: s_lshl_b32 s19, s19, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_or_b32 s2, s2, s8 -; VI-NEXT: s_lshl_b32 s8, s12, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s15, s0, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s9, s26, 8 -; VI-NEXT: s_lshr_b32 s14, s0, 24 -; VI-NEXT: s_add_i32 s15, s15, s15 -; VI-NEXT: s_or_b32 s1, s1, s9 -; VI-NEXT: s_bfe_u32 s27, s0, 0x80008 -; VI-NEXT: s_add_i32 s14, s14, s14 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s9, s15, 0xff -; VI-NEXT: s_add_i32 s27, s27, s27 +; VI-NEXT: s_add_i32 s29, s29, s29 +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s8 -; VI-NEXT: s_lshl_b32 s8, s14, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s17, s7, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s9, s27, 8 -; VI-NEXT: s_lshr_b32 s16, s7, 24 -; VI-NEXT: s_add_i32 s17, s17, s17 -; VI-NEXT: s_or_b32 s0, s0, s9 -; VI-NEXT: s_bfe_u32 s28, s7, 0x80008 -; VI-NEXT: s_add_i32 s16, s16, s16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_and_b32 s9, s17, 0xff ; VI-NEXT: s_add_i32 s28, s28, s28 -; VI-NEXT: s_add_i32 s7, s7, s7 -; VI-NEXT: s_or_b32 s0, s0, s8 -; VI-NEXT: s_lshl_b32 s8, s16, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s19, s6, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s28, 8 -; VI-NEXT: s_lshr_b32 s18, s6, 24 -; VI-NEXT: s_add_i32 s19, s19, s19 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_bfe_u32 s29, s6, 0x80008 -; VI-NEXT: s_add_i32 s18, s18, s18 +; VI-NEXT: s_or_b32 s7, s7, s15 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s5, s5, s11 +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s23, s31, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s21, s30, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s19, s29, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s17, s28, 8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s9, s19, 0xff -; VI-NEXT: s_add_i32 s29, s29, s29 -; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_lshl_b32 s8, s18, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s21, s5, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s9, s29, 8 -; VI-NEXT: s_lshr_b32 s20, s5, 24 -; VI-NEXT: s_add_i32 s21, s21, s21 -; VI-NEXT: s_or_b32 s6, s6, s9 -; VI-NEXT: s_bfe_u32 s30, s5, 0x80008 -; VI-NEXT: s_add_i32 s20, s20, s20 +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s9, s21, 0xff -; VI-NEXT: s_add_i32 s30, s30, s30 -; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_lshl_b32 s8, s20, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s23, s4, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_lshl_b32 s9, s30, 8 -; VI-NEXT: s_lshr_b32 s22, s4, 24 -; VI-NEXT: s_add_i32 s23, s23, s23 -; VI-NEXT: s_or_b32 s5, s5, s9 -; VI-NEXT: s_bfe_u32 s31, s4, 0x80008 -; VI-NEXT: s_add_i32 s22, s22, s22 +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_and_b32 s9, s23, 0xff -; VI-NEXT: s_add_i32 s31, s31, s31 -; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s22, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s9, s31, 8 -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s23 +; VI-NEXT: s_or_b32 s2, s2, s21 +; VI-NEXT: s_or_b32 s1, s1, s19 +; VI-NEXT: s_or_b32 s0, s0, s17 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: s_or_b32 s6, s6, s12 +; VI-NEXT: s_or_b32 s5, s5, s10 ; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_lshl_b32 s22, s22, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s20, s20, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_or_b32 s3, s3, s22 +; VI-NEXT: s_or_b32 s2, s2, s20 +; VI-NEXT: s_or_b32 s1, s1, s18 +; VI-NEXT: s_or_b32 s0, s0, s16 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index f712421083e6b..df35a4e4bcc75 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -1,7 +1,7 @@ ; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -codegenprepare < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; This particular case will actually be worse in terms of code size ; from sinking into both. @@ -116,21 +116,15 @@ ret: ; OPT: store ; OPT: ret -; For GFX8: since i16 is legal type, we cannot sink lshr into .LBBs. - ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x2c -; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 ; GCN: s_cbranch_scc{{[0-1]}} ; GCN: ; %bb.1: -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7f +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 ; GCN: .LBB2_2: -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 ; GCN: buffer_store_short ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 00f74f50a4b8b..52c90817dddd1 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1623,15 +1623,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1669,11 +1664,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1684,11 +1675,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, -16 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm @@ -1700,13 +1690,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 92ece0d007fe2..773369b7a5beb 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -652,14 +652,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v3 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -760,16 +759,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index f0c278a67c8bc..7f83fc571bf29 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1402,15 +1402,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1448,10 +1443,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX10-NEXT: v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1463,9 +1455,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 3c45596fba14b..6b1551a88df5c 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -629,9 +629,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -731,9 +730,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 @@ -1460,13 +1458,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x100, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -1503,14 +1496,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v3 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -1557,19 +1550,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_readfirstlane_b32 s2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s3, v0 -; VI-NEXT: s_lshl_b32 s2, s2, 8 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s3, s2, 0x10000 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_ff1_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 -; VI-NEXT: s_cselect_b32 s2, s3, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1611,9 +1597,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 07a7d8d20c439..7262724064918 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -647,12 +647,7 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -731,11 +726,9 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 -; VI-NEXT: s_xor_b32 s2, s2, 0x8000 ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 -; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_sub_f16_e64 v1, 2.0, s2 +; VI-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 5a9259efc0cc8..9a93b1d8b5909 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -20,13 +20,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s2, 0xffff -; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_lshr_b32 s2, s2, s5 -; VI-NEXT: s_lshr_b32 s3, s4, s3 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshr_b32 s4, s5, s4 +; VI-NEXT: s_lshr_b32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s3, s4, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 201b85c745c18..05ffaf62ff1e0 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -454,15 +454,15 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[8:9], 0x28 -; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_load_dword s2, s[8:9], 0x4c +; VI-NEXT: s_load_dword s3, s[8:9], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 -; VI-NEXT: s_min_i32 s2, s2, s3 +; VI-NEXT: s_min_i32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -472,14 +472,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 -; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x4c +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x28 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 ; GFX9-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NEXT: s_min_i32 s2, s2, s3 +; GFX9-NEXT: s_min_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -487,14 +487,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 -; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x4c +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x28 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 ; GFX10-NEXT: s_sext_i32_i8 s3, s3 -; GFX10-NEXT: s_min_i32 s2, s2, s3 +; GFX10-NEXT: s_min_i32 s2, s3, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -502,15 +502,15 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x28 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sext_i32_i8 s2, s2 ; GFX11-NEXT: s_sext_i32_i8 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s2, s3, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -614,30 +614,32 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 -; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 -; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 -; VI-NEXT: s_bfe_i32 s6, s2, 0x80008 -; VI-NEXT: s_sext_i32_i8 s2, s2 -; VI-NEXT: s_ashr_i32 s7, s3, 24 -; VI-NEXT: s_bfe_i32 s8, s3, 0x80010 -; VI-NEXT: s_bfe_i32 s9, s3, 0x80008 +; VI-NEXT: s_ashr_i32 s6, s3, 24 +; VI-NEXT: s_min_i32 s4, s4, s6 +; VI-NEXT: s_bfe_i32 s6, s3, 0x80010 +; VI-NEXT: s_bfe_i32 s8, s2, 0x80010 +; VI-NEXT: s_min_i32 s6, s8, s6 +; VI-NEXT: s_sext_i32_i16 s5, s2 +; VI-NEXT: s_sext_i32_i16 s7, s3 +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_ashr_i32 s6, s7, 8 +; VI-NEXT: s_ashr_i32 s5, s5, 8 ; VI-NEXT: s_sext_i32_i8 s3, s3 +; VI-NEXT: s_sext_i32_i8 s2, s2 +; VI-NEXT: s_min_i32 s5, s5, s6 ; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_min_i32 s3, s6, s9 -; VI-NEXT: s_min_i32 s5, s5, s8 -; VI-NEXT: s_min_i32 s4, s4, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s4, s4, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_or_b32 s2, s2, s5 +; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -818,11 +820,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s4, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_ashr_i32 s5, s3, 16 +; VI-NEXT: s_ashr_i32 s4, s3, 16 +; VI-NEXT: s_ashr_i32 s5, s2, 16 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_min_i32 s4, s4, s5 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_min_i32 s4, s5, s4 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff @@ -935,24 +937,24 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_ashr_i32 s8, s3, 16 +; VI-NEXT: s_ashr_i32 s6, s3, 16 +; VI-NEXT: s_ashr_i32 s7, s1, 16 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_ashr_i32 s7, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s9, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_min_i32 s6, s6, s8 +; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_min_i32 s6, s7, s6 ; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_min_i32 s7, s7, s9 -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: s_lshl_b32 s2, s6, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_lshl_b32 s2, s7, 16 +; VI-NEXT: s_or_b32 s1, s1, s6 +; VI-NEXT: s_ashr_i32 s3, s2, 16 +; VI-NEXT: s_ashr_i32 s6, s0, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_min_i32 s3, s6, s3 +; VI-NEXT: s_min_i32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s3, s3, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_or_b32 s0, s0, s3 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2726,19 +2728,22 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: flat_load_ushort v5, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_ushort v4, v[2:3] +; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_and_b32_e32 v6, 0xffff, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; VI-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: flat_store_byte v[2:3], v0 @@ -2752,7 +2757,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -2768,29 +2773,54 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX10-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: global_store_byte v0, v2, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_umin_ult_i16_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u16 v1, v0, s[4:5] -; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_umin_ult_i16_multi_use: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[6:7] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v0, s[4:5] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_umin_ult_i16_multi_use: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm %a = load i16, ptr addrspace(1) %aptr, align 2 %b = load i16, ptr addrspace(1) %bptr, align 2 %cmp = icmp ult i16 %a, %b @@ -3155,38 +3185,38 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s10, s3, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_lshr_b32 s11, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshr_b32 s12, s1, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_lshr_b32 s13, s0, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshr_b32 s16, s5, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshr_b32 s17, s4, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_min_u32 s0, s0, s4 -; VI-NEXT: s_min_u32 s4, s13, s17 -; VI-NEXT: s_min_u32 s1, s1, s5 -; VI-NEXT: s_min_u32 s5, s12, s16 -; VI-NEXT: s_min_u32 s2, s2, s6 -; VI-NEXT: s_min_u32 s6, s11, s15 +; VI-NEXT: s_and_b32 s10, s7, 0xffff +; VI-NEXT: s_and_b32 s11, s3, 0xffff +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s3, s3, 16 ; VI-NEXT: s_min_u32 s3, s3, s7 -; VI-NEXT: s_min_u32 s7, s10, s14 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_or_b32 s3, s3, s7 -; VI-NEXT: s_or_b32 s2, s2, s6 -; VI-NEXT: s_or_b32 s1, s1, s5 -; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: s_min_u32 s10, s11, s10 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s3, s10, s3 +; VI-NEXT: s_and_b32 s7, s6, 0xffff +; VI-NEXT: s_and_b32 s10, s2, 0xffff +; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_min_u32 s2, s2, s6 +; VI-NEXT: s_min_u32 s7, s10, s7 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s2, s7, s2 +; VI-NEXT: s_and_b32 s6, s5, 0xffff +; VI-NEXT: s_and_b32 s7, s1, 0xffff +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s1, s1, 16 +; VI-NEXT: s_min_u32 s1, s1, s5 +; VI-NEXT: s_min_u32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s1, s6, s1 +; VI-NEXT: s_and_b32 s5, s4, 0xffff +; VI-NEXT: s_and_b32 s6, s0, 0xffff +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: s_min_u32 s0, s0, s4 +; VI-NEXT: s_min_u32 s5, s6, s5 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s5, s0 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3529,9 +3559,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s3, s2 -; VI-NEXT: s_ashr_i32 s2, s2, 16 -; VI-NEXT: s_min_i32 s2, s3, s2 +; VI-NEXT: s_ashr_i32 s3, s2, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -3544,9 +3574,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_min_i32 s2, s3, s2 +; GFX9-NEXT: s_ashr_i32 s3, s2, 16 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: s_min_i32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -3558,9 +3588,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s3, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_min_i32 s2, s3, s2 +; GFX10-NEXT: s_ashr_i32 s3, s2, 16 +; GFX10-NEXT: s_sext_i32_i16 s2, s2 +; GFX10-NEXT: s_min_i32 s2, s2, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -3572,10 +3602,10 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s3, s2 -; GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GFX11-NEXT: s_ashr_i32 s3, s2, 16 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s3, s2 +; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index d999945948101..38e45042b5ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1787,15 +1787,14 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; NOSDWA-NEXT: flat_load_dword v1, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 -; NOSDWA-NEXT: s_waitcnt vmcnt(1) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, v1, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v4 -; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; NOSDWA-NEXT: v_or_b32_e32 v2, v1, v2 +; NOSDWA-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 ; NOSDWA-NEXT: s_endpgm @@ -1813,9 +1812,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX89-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX89-NEXT: v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_add_u32_e32 v3, vcc, v1, v2 +; GFX89-NEXT: v_add_u32_sdwa v1, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX89-NEXT: v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index 4e3dccb975fe8..cc07ee4ee4780 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -521,13 +521,10 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %o ; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16: ; GCN: s_load_dword [[VAL:s[0-9]+]] -; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000 -; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] -; SI: buffer_store_short [[VBFE]] +; GCN: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000 +; GCN: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] +; GCN: buffer_store_short [[VBFE]] -; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { %ld = load i32, ptr addrspace(4) %ptr %in = trunc i32 %ld to i16 @@ -622,9 +619,7 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 % ; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]] ; SI: buffer_store_short [[VSEXT]] -; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} -; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} +; GFX89: s_bfe_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000 define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %in) #0 { %shl = shl i16 %in, 8 %sext = ashr i16 %shl, 8 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 4b616e836f916..1c5c16d886251 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -27,9 +27,9 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_lshl_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshl_b32 s0, s1, s0 ; VI-NEXT: s_lshl_b32 s1, s2, s3 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index fe47663b11028..6ca8f490ff165 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -22,17 +22,17 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_sub_i32 s4, 0, s2 +; VI-NEXT: s_sub_i32 s3, 0, s2 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_ashr_i32 s5, s2, 16 +; VI-NEXT: s_sub_i32 s4, 0, s4 +; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sub_i32 s3, 0, s3 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_max_i32 s2, s2, s4 -; VI-NEXT: s_max_i32 s3, s5, s3 +; VI-NEXT: s_max_i32 s2, s2, s3 +; VI-NEXT: s_max_i32 s4, s5, s4 ; VI-NEXT: s_add_i32 s2, s2, 2 -; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_lshl_b32 s3, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 @@ -171,17 +171,17 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_sub_i32 s4, 0, s2 +; VI-NEXT: s_sub_i32 s3, 0, s2 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_ashr_i32 s5, s2, 16 +; VI-NEXT: s_sub_i32 s4, 0, s4 +; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sub_i32 s3, 0, s3 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_max_i32 s2, s2, s4 -; VI-NEXT: s_max_i32 s3, s5, s3 +; VI-NEXT: s_max_i32 s2, s2, s3 +; VI-NEXT: s_max_i32 s4, s5, s4 ; VI-NEXT: s_add_i32 s2, s2, 2 -; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_lshl_b32 s3, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 @@ -331,31 +331,31 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_sub_i32 s6, 0, s3 -; VI-NEXT: s_sub_i32 s7, 0, s2 -; VI-NEXT: s_sub_i32 s5, 0, s5 -; VI-NEXT: s_sub_i32 s4, 0, s4 +; VI-NEXT: s_lshr_b32 s7, s2, 16 +; VI-NEXT: s_sub_i32 s7, 0, s7 +; VI-NEXT: s_sub_i32 s4, 0, s3 +; VI-NEXT: s_lshr_b32 s6, s3, 16 ; VI-NEXT: s_ashr_i32 s8, s2, 16 -; VI-NEXT: s_ashr_i32 s9, s3, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_sext_i32_i16 s7, s7 -; VI-NEXT: s_sext_i32_i16 s6, s6 +; VI-NEXT: s_sub_i32 s5, 0, s2 +; VI-NEXT: s_sub_i32 s6, 0, s6 +; VI-NEXT: s_max_i32 s7, s8, s7 +; VI-NEXT: s_ashr_i32 s8, s3, 16 ; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: s_sext_i32_i16 s6, s6 ; VI-NEXT: s_sext_i32_i16 s5, s5 -; VI-NEXT: s_max_i32 s3, s3, s6 -; VI-NEXT: s_max_i32 s2, s2, s7 -; VI-NEXT: s_max_i32 s5, s9, s5 -; VI-NEXT: s_max_i32 s4, s8, s4 -; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_max_i32 s3, s3, s4 +; VI-NEXT: s_max_i32 s6, s8, s6 +; VI-NEXT: s_max_i32 s2, s2, s5 ; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s4, s6, 16 ; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_or_b32 s3, s4, s3 +; VI-NEXT: s_lshl_b32 s4, s7, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s3, s5, s3 ; VI-NEXT: s_or_b32 s2, s4, s2 ; VI-NEXT: s_add_i32 s3, s3, 0x20000 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 @@ -560,23 +560,23 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_ashr_i32 s0, s4, 16 -; VI-NEXT: s_sext_i32_i16 s1, s4 -; VI-NEXT: s_ashr_i32 s2, s5, 16 +; VI-NEXT: s_ashr_i32 s0, s5, 16 +; VI-NEXT: s_ashr_i32 s1, s4, 16 ; VI-NEXT: s_sext_i32_i16 s3, s5 -; VI-NEXT: s_max_i32 s4, s0, s2 -; VI-NEXT: s_max_i32 s5, s1, s3 -; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_max_i32 s2, s1, s0 +; VI-NEXT: s_max_i32 s5, s4, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_min_i32 s0, s1, s0 +; VI-NEXT: s_min_i32 s1, s4, s3 +; VI-NEXT: s_or_b32 s2, s5, s2 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff ; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -661,12 +661,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[2:3], v4 @@ -748,37 +748,37 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_ashr_i32 s0, s5, 16 -; VI-NEXT: s_ashr_i32 s1, s4, 16 -; VI-NEXT: s_sext_i32_i16 s2, s5 -; VI-NEXT: s_sext_i32_i16 s3, s4 -; VI-NEXT: s_ashr_i32 s4, s7, 16 -; VI-NEXT: s_ashr_i32 s5, s6, 16 -; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: s_ashr_i32 s0, s7, 16 +; VI-NEXT: s_ashr_i32 s1, s5, 16 +; VI-NEXT: s_sext_i32_i16 s3, s7 +; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_max_i32 s2, s1, s0 +; VI-NEXT: s_max_i32 s7, s5, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s2, s7, s2 +; VI-NEXT: s_ashr_i32 s7, s6, 16 +; VI-NEXT: s_ashr_i32 s8, s4, 16 ; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_max_i32 s8, s1, s5 -; VI-NEXT: s_max_i32 s9, s0, s4 -; VI-NEXT: s_max_i32 s10, s3, s6 -; VI-NEXT: s_max_i32 s11, s2, s7 -; VI-NEXT: s_min_i32 s0, s0, s4 -; VI-NEXT: s_min_i32 s2, s2, s7 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_min_i32 s0, s1, s0 +; VI-NEXT: s_min_i32 s1, s5, s3 +; VI-NEXT: s_max_i32 s9, s8, s7 +; VI-NEXT: s_max_i32 s10, s4, s6 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_min_i32 s1, s1, s5 -; VI-NEXT: s_min_i32 s3, s3, s6 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s9, s11, s9 -; VI-NEXT: s_or_b32 s8, s10, s8 -; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_min_i32 s1, s8, s7 +; VI-NEXT: s_min_i32 s2, s4, s6 +; VI-NEXT: s_or_b32 s9, s10, s9 ; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s2, s3, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: s_or_b32 s1, s2, s1 ; VI-NEXT: v_mov_b32_e32 v6, s1 ; VI-NEXT: v_mov_b32_e32 v7, s0 @@ -899,42 +899,34 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_readfirstlane_b32 s0, v4 -; VI-NEXT: v_readfirstlane_b32 s1, v5 -; VI-NEXT: s_ashr_i32 s3, s0, 16 -; VI-NEXT: s_ashr_i32 s5, s1, 16 -; VI-NEXT: s_cmp_gt_i32 s3, s5 -; VI-NEXT: s_sext_i32_i16 s2, s0 -; VI-NEXT: s_sext_i32_i16 s4, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, s3, s5 -; VI-NEXT: s_cselect_b32 s3, s5, s3 -; VI-NEXT: s_lshl_b32 s5, s0, 16 -; VI-NEXT: s_cmp_gt_i32 s2, s4 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, s2, s4 -; VI-NEXT: s_cselect_b32 s1, s4, s2 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v4 -; VI-NEXT: s_lshl_b32 s2, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s5 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: v_and_b32_e32 v4, 3, v4 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_ashrrev_i32_e32 v10, 16, v4 +; VI-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; VI-NEXT: v_bfe_i32 v6, v4, 0, 16 +; VI-NEXT: v_bfe_i32 v7, v5, 0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, v10, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], v6, v7 +; VI-NEXT: v_cndmask_b32_e64 v6, v5, v4, s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; VI-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: flat_store_dword v[0:1], v6 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v6 +; VI-NEXT: v_or_b32_e32 v0, v9, v5 +; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 3, v0 +; VI-NEXT: flat_store_dword v[2:3], v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_byte v[0:1], v4 +; VI-NEXT: flat_store_byte v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; @@ -1020,23 +1012,23 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_lshr_b32 s0, s4, 16 -; VI-NEXT: s_lshr_b32 s2, s5, 16 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_and_b32 s1, s4, 0xffff -; VI-NEXT: s_and_b32 s3, s5, 0xffff -; VI-NEXT: s_max_u32 s5, s0, s2 -; VI-NEXT: s_max_u32 s4, s1, s3 +; VI-NEXT: s_lshr_b32 s3, s5, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s0, s5, 0xffff +; VI-NEXT: s_max_u32 s5, s4, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_max_u32 s2, s1, s0 ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_min_u32 s0, s0, s2 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_min_u32 s1, s1, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_min_u32 s0, s1, s0 +; VI-NEXT: s_min_u32 s1, s4, s3 +; VI-NEXT: s_or_b32 s2, s2, s5 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 67c51286de216..68ed7cecd8ff7 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -187,15 +187,14 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s0, v1 -; VI-NEXT: v_readfirstlane_b32 s1, v0 -; VI-NEXT: s_ashr_i32 s2, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: v_readfirstlane_b32 s1, v1 +; VI-NEXT: s_lshr_b32 s2, s1, 16 ; VI-NEXT: s_ashr_i32 s3, s0, 16 ; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s0, s1, s0 -; VI-NEXT: s_ashr_i32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_ashr_i32 s2, s3, s2 +; VI-NEXT: s_ashr_i32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -294,28 +293,26 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s4, v2 -; VI-NEXT: v_readfirstlane_b32 s5, v3 -; VI-NEXT: v_readfirstlane_b32 s6, v0 -; VI-NEXT: v_readfirstlane_b32 s7, v1 -; VI-NEXT: s_ashr_i32 s8, s7, 16 -; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: v_readfirstlane_b32 s4, v0 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_lshr_b32 s8, s7, 16 ; VI-NEXT: s_ashr_i32 s9, s6, 16 ; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_ashr_i32 s10, s5, 16 -; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: s_lshr_b32 s10, s5, 16 ; VI-NEXT: s_ashr_i32 s11, s4, 16 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_ashr_i32 s4, s6, s4 -; VI-NEXT: s_ashr_i32 s6, s9, s11 -; VI-NEXT: s_ashr_i32 s5, s7, s5 -; VI-NEXT: s_ashr_i32 s7, s8, s10 +; VI-NEXT: s_ashr_i32 s8, s9, s8 +; VI-NEXT: s_ashr_i32 s6, s6, s7 +; VI-NEXT: s_ashr_i32 s7, s11, s10 +; VI-NEXT: s_ashr_i32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_or_b32 s4, s4, s7 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 44e403854217e..42bd2ff8797a1 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -116,23 +116,21 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dword s6, s[2:3], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_sub_i32 s2, s2, s3 -; VI-NEXT: s_sub_i32 s0, s0, s1 -; VI-NEXT: s_and_b32 s1, s2, 0xffff -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_sub_i32 s5, s6, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_sub_i32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16: @@ -230,9 +228,9 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_sub_i32 s0, s1, s0 ; VI-NEXT: s_sub_i32 s1, s2, s3 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index d41720e19c217..77d1e6c2593c1 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -317,7 +317,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s0, 0xffffff00 +; VI-NEXT: s_and_b32 s1, s0, 0xff00 ; VI-NEXT: s_add_i32 s0, s0, 12 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: s_and_b32 s0, s0, 0xff diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index af50e09f509a3..c77828aa5606f 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s ; R600: {{^}}s_mad_zext_i32_to_i64: @@ -53,7 +53,8 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}} ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}} -; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]] +; VI: s_cmp_eq_u32 [[MASK_B]], [[MASK_A]] +; SI: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]] ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] ; GCN: buffer_store_short [[RESULT]]