-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AMDGPU] Do not promote uniform i16 operations to i32 in CGP #140208
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes: For the majority of cases, this is a neutral or positive change. Solves #64591. Patch is 2.28 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/140208.diff 53 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index efb2894aaf642..c966d4a57c77f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -50,7 +50,7 @@ static cl::opt<bool> Widen16BitOps(
"amdgpu-codegenprepare-widen-16-bit-ops",
cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
cl::ReallyHidden,
- cl::init(true));
+ cl::init(false));
static cl::opt<bool>
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index c8b82716a9fe1..814acc3be1fc0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -281,12 +281,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0
+; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat:
@@ -323,12 +323,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
; GFX8-NEXT: s_add_i32 s1, s1, 4
+; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo:
@@ -365,12 +365,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_add_i32 s0, s0, 4
; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0
+; GFX8-NEXT: s_add_i32 s0, s0, 4
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi:
@@ -408,14 +408,13 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX8-LABEL: s_add_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s3
-; GFX8-NEXT: s_lshl_b32 s1, s2, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_add_i32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16:
@@ -461,14 +460,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
; GFX8: ; %bb.0:
; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s3
-; GFX8-NEXT: s_lshl_b32 s1, s2, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_add_i32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16_fneg_lhs:
@@ -517,14 +515,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
; GFX8: ; %bb.0:
; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s3
-; GFX8-NEXT: s_lshl_b32 s1, s2, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_add_i32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16_fneg_rhs:
@@ -580,14 +577,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s3
-; GFX8-NEXT: s_lshl_b32 s1, s2, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_add_i32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 82d87358e1faf..aea32b3fedba7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -70,30 +70,15 @@ define i8 @v_ashr_i8_7(i8 %value) {
}
define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) {
-; GFX6-LABEL: s_ashr_i8:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sext_i32_i8 s0, s0
-; GFX6-NEXT: s_ashr_i32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_ashr_i8:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i8 s0, s0
-; GFX8-NEXT: s_sext_i32_i8 s1, s1
-; GFX8-NEXT: s_ashr_i32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_ashr_i8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sext_i32_i8 s0, s0
-; GFX9-NEXT: s_sext_i32_i8 s1, s1
-; GFX9-NEXT: s_ashr_i32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_ashr_i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_sext_i32_i8 s0, s0
+; GCN-NEXT: s_ashr_i32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i8:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0
-; GFX10PLUS-NEXT: s_sext_i32_i8 s1, s1
; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i8 %value, %amount
@@ -642,30 +627,15 @@ define i16 @v_ashr_i16_15(i16 %value) {
}
define amdgpu_ps i16 @s_ashr_i16(i16 inreg %value, i16 inreg %amount) {
-; GFX6-LABEL: s_ashr_i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sext_i32_i16 s0, s0
-; GFX6-NEXT: s_ashr_i32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_ashr_i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_ashr_i32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_ashr_i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_ashr_i32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_ashr_i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NEXT: s_ashr_i32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
-; GFX10PLUS-NEXT: s_sext_i32_i16 s1, s1
; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i16 %value, %amount
@@ -826,14 +796,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX8-LABEL: s_ashr_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s3, s1
-; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT: s_ashr_i32 s2, s2, s3
+; GFX8-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_ashr_i32 s0, s0, s1
-; GFX8-NEXT: s_lshl_b32 s0, s0, 16
-; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT: s_sext_i32_i16 s1, s2
+; GFX8-NEXT: s_ashr_i32 s1, s1, s3
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
@@ -1028,23 +999,25 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX8-LABEL: s_ashr_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s5, s1
-; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s6, s2
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s7, s3
-; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
-; GFX8-NEXT: s_ashr_i32 s4, s4, s6
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: s_ashr_i32 s0, s0, s2
-; GFX8-NEXT: s_ashr_i32 s2, s5, s7
+; GFX8-NEXT: s_sext_i32_i16 s2, s4
+; GFX8-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NEXT: s_ashr_i32 s2, s2, s6
+; GFX8-NEXT: s_sext_i32_i16 s1, s1
+; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_ashr_i32 s1, s1, s3
-; GFX8-NEXT: s_lshl_b32 s0, s0, 16
-; GFX8-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s0, s3
+; GFX8-NEXT: s_sext_i32_i16 s3, s5
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_ashr_i32 s3, s3, s7
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
;
@@ -1235,41 +1208,45 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX8-LABEL: s_ashr_v8i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s9, s1
-; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s12, s4
-; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s13, s5
-; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s10, s2
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s14, s6
-; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010
+; GFX8-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_lshr_b32 s12, s4, 16
; GFX8-NEXT: s_ashr_i32 s0, s0, s4
-; GFX8-NEXT: s_ashr_i32 s4, s9, s13
+; GFX8-NEXT: s_sext_i32_i16 s4, s8
+; GFX8-NEXT: s_lshr_b32 s9, s1, 16
+; GFX8-NEXT: s_ashr_i32 s4, s4, s12
+; GFX8-NEXT: s_sext_i32_i16 s1, s1
+; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_ashr_i32 s1, s1, s5
-; GFX8-NEXT: s_sext_i32_i16 s11, s3
-; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
-; GFX8-NEXT: s_sext_i32_i16 s15, s7
-; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010
-; GFX8-NEXT: s_ashr_i32 s5, s10, s14
+; GFX8-NEXT: s_sext_i32_i16 s5, s9
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_lshr_b32 s10, s2, 16
+; GFX8-NEXT: s_ashr_i32 s5, s5, s13
+; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_ashr_i32 s2, s2, s6
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX8-NEXT: s_ashr_i32 s8, s8, s12
-; GFX8-NEXT: s_ashr_i32 s6, s11, s15
+; GFX8-NEXT: s_sext_i32_i16 s6, s10
+; GFX8-NEXT: s_or_b32 s0, s0, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT: s_lshr_b32 s11, s3, 16
+; GFX8-NEXT: s_ashr_i32 s6, s6, s14
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_lshr_b32 s15, s7, 16
; GFX8-NEXT: s_ashr_i32 s3, s3, s7
+; GFX8-NEXT: s_sext_i32_i16 s7, s11
; GFX8-NEXT: s_or_b32 s1, s1, s4
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_and_b32 s4, s5, 0xffff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 16
-; GFX8-NEXT: s_and_b32 s7, s8, 0xffff
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_ashr_i32 s7, s7, s15
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_or_b32 s2, s2, s4
-; GFX8-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NEXT: s_and_b32 s4, s6, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s0, s7
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_or_b32 s3, s3, s4
; GFX8-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 7fa0d23e55938..be1dc7f0c67f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -85,14 +85,27 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
}
define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
-; GCN-LABEL: s_lshr_i8_7:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_u32 s0, s0, 0x10007
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i8_7:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10007
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i8_7:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NEXT: s_lshr_b32 s0, s0, 7
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i8_7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 7
+; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i8_7:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x10007
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 7
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i8 %value, 7
ret i8 %result
@@ -619,15 +632,27 @@ define i16 @v_lshr_i16_15(i16 %value) {
}
define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
-; GCN-LABEL: s_lshr_i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshr_b32 s0, s0, s1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s0, s0, s1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, s1
+; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i16:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i16 %value, %amount
@@ -635,14 +660,27 @@ define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
}
define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
-; GCN-LABEL: s_lshr_i16_15:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_u32 s0, s0, 0x1000f
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i16_15:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1000f
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i16_15:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 15
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i16_15:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 15
+; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i16_15:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1000f
+; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 15
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i16 %value, 15
ret i16 %result
@@ -783,13 +821,12 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
; GFX8-LABEL: s_lshr_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: s_lshr_b32 s1, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v2i16:
@@ -970,21 +1007,19 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
; GFX8-LABEL: s_lshr_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s2
; GFX8-NEXT: s_lshr_b32 s2, s4, s6
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshr_b32 s1, s1, s3
; GFX8-NEXT: s_lshr_b32 s3, s5, s7
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT: s_or_b32 s0, s2, s0
+; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_lshl_b32 s2, s3, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT: s_or_b32 s1, s2, s1
+; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v4i16:
@@ -1155,37 +1190,33 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
; GFX8-LABEL: s_lshr_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s4
; GFX8-NEXT: s_lshr_b32 s4, s8, s12
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_lshr_b32 s1, s1, s5
; GFX8-NEXT: s_lshr_b32 s5, s9, s13
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: ...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
; | ||
; GFX8-LABEL: s_lshr_i8_7: | ||
; GFX8: ; %bb.0: | ||
; GFX8-NEXT: s_and_b32 s0, s0, 0xff |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is what #140040 is about
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. As a next step, we can delete the option and then reverse the iteration order.
Merge activity
|
cd12b2c
to
bb86037
Compare
For the majority of cases, this is a neutral or positive change. There are even testcases that greatly benefit from it, but some regressions are possible. There is #140040 for GlobalISel that would need to be fixed, but it is only a one-instruction regression and I think it can be fixed later. Solves #64591.
4f40044
to
4cd368e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice!
…lvm#140208)" This reverts commit aacebae.
…lvm#140208)" This reverts commit aacebae.
For the majority of cases, this is a neutral or positive change.
There are even testcases that greatly benefit from it, but some regressions are possible.
There is #140040 for GlobalISel that would need to be fixed, but it is only a one-instruction regression and I think it can be fixed later.
Solves #64591