[X86] Remove extra MOV after widening atomic load #138635
Conversation
@llvm/pr-subscribers-backend-x86

Author: None (jofrn)

Changes: This change adds patterns to optimize out an extra MOV present after widening the atomic load.
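A worked example, distilled from the test updates below: a <2 x i16> atomic load is widened to an i32 atomic load during legalization, which previously materialized in a GPR before being moved into an XMM register.

define <2 x i16> @atomic_vec2_i16(ptr %x) {
  %ret = load atomic <2 x i16>, ptr %x acquire, align 4
  ret <2 x i16> %ret
}

; Before: the widened i32 atomic load goes through a GPR.
;   movl (%rdi), %eax
;   movd %eax, %xmm0
; After: the new pattern folds the atomic load directly into the vector move.
;   movss (%rdi), %xmm0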
Full diff: https://github.com/llvm/llvm-project/pull/138635.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 167e27eddd71e..8ad8a0a6194d6 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1200,6 +1200,13 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)), (MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (anyext (i16 (atomic_load_16 addr:$src)))))),
+ (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8>
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i16>
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
+ (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float>
+
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9ee8b4fc5ac7f..935d058a52f8f 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -149,8 +149,7 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
define <2 x i8> @atomic_vec2_i8(ptr %x) {
; CHECK3-LABEL: atomic_vec2_i8:
; CHECK3: ## %bb.0:
-; CHECK3-NEXT: movzwl (%rdi), %eax
-; CHECK3-NEXT: movd %eax, %xmm0
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK3-NEXT: retq
;
; CHECK0-LABEL: atomic_vec2_i8:
@@ -165,11 +164,15 @@ define <2 x i8> @atomic_vec2_i8(ptr %x) {
}
define <2 x i16> @atomic_vec2_i16(ptr %x) {
-; CHECK-LABEL: atomic_vec2_i16:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movl (%rdi), %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: retq
+; CHECK3-LABEL: atomic_vec2_i16:
+; CHECK3: ## %bb.0:
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK3-NEXT: retq
+;
+; CHECK0-LABEL: atomic_vec2_i16:
+; CHECK0: ## %bb.0:
+; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK0-NEXT: retq
%ret = load atomic <2 x i16>, ptr %x acquire, align 4
ret <2 x i16> %ret
}
@@ -177,8 +180,7 @@ define <2 x i16> @atomic_vec2_i16(ptr %x) {
define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) {
; CHECK-LABEL: atomic_vec2_ptr270:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movq (%rdi), %xmm0
; CHECK-NEXT: retq
%ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
ret <2 x ptr addrspace(270)> %ret
@@ -187,8 +189,7 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) {
define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
; CHECK-LABEL: atomic_vec2_i32_align:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movq (%rdi), %xmm0
; CHECK-NEXT: retq
%ret = load atomic <2 x i32>, ptr %x acquire, align 8
ret <2 x i32> %ret
@@ -197,8 +198,7 @@ define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
define <2 x float> @atomic_vec2_float_align(ptr %x) {
; CHECK-LABEL: atomic_vec2_float_align:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movq (%rdi), %xmm0
; CHECK-NEXT: retq
%ret = load atomic <2 x float>, ptr %x acquire, align 8
ret <2 x float> %ret
@@ -354,11 +354,15 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
}
define <4 x i8> @atomic_vec4_i8(ptr %x) nounwind {
-; CHECK-LABEL: atomic_vec4_i8:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movl (%rdi), %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: retq
+; CHECK3-LABEL: atomic_vec4_i8:
+; CHECK3: ## %bb.0:
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK3-NEXT: retq
+;
+; CHECK0-LABEL: atomic_vec4_i8:
+; CHECK0: ## %bb.0:
+; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK0-NEXT: retq
%ret = load atomic <4 x i8>, ptr %x acquire, align 4
ret <4 x i8> %ret
}
@@ -366,8 +370,7 @@ define <4 x i8> @atomic_vec4_i8(ptr %x) nounwind {
define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
; CHECK-LABEL: atomic_vec4_i16:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movq (%rdi), %xmm0
; CHECK-NEXT: retq
%ret = load atomic <4 x i16>, ptr %x acquire, align 8
ret <4 x i16> %ret
@@ -1200,6 +1200,13 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)), (MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>;

def : Pat<(v4i32 (scalar_to_vector (i32 (anyext (i16 (atomic_load_16 addr:$src)))))),
          (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8>
this will dereference 32-bits
Switched it to a zext, and now it dereferences 16 bits in the asm. Thanks.
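For context: MOVDI2PDIrm performs an unconditional 32-bit memory load, so selecting it for the anyext of a 16-bit atomic load would read two bytes beyond the object. A minimal sketch of a zext-based pattern that keeps the dereference at 16 bits follows; this assumes the fix zero-extends through a GPR via MOVZX32rm16 before the GPR-to-XMM move, and the actual commit may differ.

def : Pat<(v4i32 (scalar_to_vector (i32 (zext (i16 (atomic_load_16 addr:$src)))))),
          (MOVDI2PDIrr (MOVZX32rm16 addr:$src))>; // movzwl + movd: 16-bit dereference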
Next thing is to add SSE/AVX handling - I've added better test coverage at d27d0c7
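For reference, a sketch of what the AVX variants could look like, assuming the VEX-encoded counterparts (VMOVDI2PDIrm, VMOV64toPQIrm) guarded by the usual UseAVX predicate; d27d0c7 may structure this differently:

def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
          (VMOVDI2PDIrm addr:$src)>, Requires<[UseAVX]>;
def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
          (VMOV64toPQIrm addr:$src)>, Requires<[UseAVX]>;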
Without loss of generality, do we not need the v2 in --check-prefixes=CHECK,CHECKv2-O0 due to divergence of the asm?

; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
Yes, you'll have to add extra check-prefixes: base + v2 can share CHECK-SSE-O* prefixes, and v3/v4 can share CHECK-AVX-O* prefixes.
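A sketch of RUN lines following that suggestion (prefix spellings are illustrative; the committed test may differ):

; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0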
Ok. Thanks! Since you've made the commits already, I'll interleave the SSE/AVX updates throughout the series rather than making a new PR.
…ctor atomic memory operations #138635, where we need to ensure correct SSE/AVX load instructions
This change adds patterns to optimize out an extra MOV present after widening the atomic load. commit-id:45989503