[AArch64] Skip storing of stack arguments when lowering tail calls #126735
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Author: Guy David (guy-david)

Changes

This issue starts in the selection DAG and causes the backend to emit the following for a trivial tail call:
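For reference, the redundant pattern (reconstructed from the ldr/str CHECK lines in darwinpcs-tail.ll below, not verbatim from the PR) looks like:

  ldr w9, [sp, #4]    ; reload the caller's own incoming stack argument
  str w9, [sp, #4]    ; store it straight back to the identical slot
  b   __ZN1C1fEiiiiiiiiiz

With this patch, the load/store pair is omitted and only the branch remains.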
I'm not too sure that checking for immutability of a specific stack object is a good enough guarantee, because as soon as a tail call is done lowering, setHasTailCall() is called, and in that case perhaps a pass is allowed to change the value of the object in memory?

This can be extended to the ARM backend as well.

Removed the tailcall keyword from a few other test assets; I'm assuming their original intent was left intact.

Full diff: https://github.com/llvm/llvm-project/pull/126735.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0d1608a97bfd3..2a325dd418788 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9328,10 +9328,25 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
- SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
- PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
+ // When the frame pointer is perfectly aligned for the tail call and the
+      // same stack argument is passed down, omit storing it if it is
+      // immutable and already at the right offset.
+ if (FPDiff == 0) {
+ if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
+ if (auto *FINode =
+ dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ int FI = FINode->getIndex();
+ if (LoadNode->getMemoryVT() == VA.getValVT() &&
+ MFI.isImmutableObjectIndex(FI) &&
+ Offset == MFI.getObjectOffset(FI))
+ continue;
+ }
+ }
+ }
+
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
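For context, the case being skipped is an outgoing argument that is itself a load from one of the caller's own fixed stack slots. In a SelectionDAG dump that looks roughly like the following (a hand-written sketch; node numbers and frame indices are illustrative):

  t6: i32,ch = load<(load (s32) from %fixed-stack.0)> t0, FrameIndex:i64<-2>, undef:i64

When FPDiff == 0 the callee's argument slot coincides with the caller's, so as long as the object is immutable and the offsets match, storing the loaded value back is a no-op.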
diff --git a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
index 5d3c755d0d73d..61c25edded2ac 100644
--- a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
+++ b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
@@ -5,11 +5,11 @@
; CHECK-LABEL: __ZThn16_N1C3addEPKcz:
; CHECK: b __ZN1C3addEPKcz
+
; CHECK-LABEL: _tailTest:
; CHECK: b __ZN1C3addEPKcz
+
; CHECK-LABEL: __ZThn8_N1C1fEiiiiiiiiiz:
-; CHECK: ldr w9, [sp, #4]
-; CHECK: str w9, [sp, #4]
; CHECK: b __ZN1C1fEiiiiiiiiiz
%class.C = type { %class.A.base, [4 x i8], %class.B.base, [4 x i8] }
diff --git a/llvm/test/CodeGen/AArch64/scavenge-large-call.ll b/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
index 0c9bdd098aa2a..0cbdd087a5b96 100644
--- a/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
+++ b/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
@@ -4,7 +4,7 @@
; CHECK: add {{x[0-9]+}}, sp,
define void @caller(ptr %0, i16 %1, i16 %2, i8 %3, double %4, i16 %5, i8 %6, ptr %7, double %8, i32 %9, ptr %10, double %11, double %12, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double %16, double %17, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double %22, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] %26, i8 %27, i16 %28, i16 %29, i16 %30, i32 %31, [2 x i64] %32, [2 x i64] %33, [2 x i64] %34, [2 x i64] %35, [2 x i64] %36, i32 %37, i32 %38) {
- tail call void @callee(ptr %0, i16 %1, i16 %2, i8 %3, double 0.000000e+00, i16 %5, i8 %6, ptr %7, double 0.000000e+00, i32 %9, ptr %10, double 0.000000e+00, double 0.000000e+00, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double 0.000000e+00, double 0.000000e+00, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double 0.000000e+00, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] zeroinitializer, i8 %27, i16 0, i16 0, i16 %28, i32 0, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] %35, [2 x i64] %36, i32 0, i32 0)
+ call void @callee(ptr %0, i16 %1, i16 %2, i8 %3, double 0.000000e+00, i16 %5, i8 %6, ptr %7, double 0.000000e+00, i32 %9, ptr %10, double 0.000000e+00, double 0.000000e+00, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double 0.000000e+00, double 0.000000e+00, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double 0.000000e+00, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] zeroinitializer, i8 %27, i16 0, i16 0, i16 %28, i32 0, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] %35, [2 x i64] %36, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
index 1bd688d23050b..c4d886336cf68 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
@@ -11,67 +11,66 @@ target triple = "aarch64-unknown-linux-gnu"
define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
; CHECK-LABEL: func1:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w29, -48
+; CHECK-NEXT: sub sp, sp, #368
+; CHECK-NEXT: stp x29, x30, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #352] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #336
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: add x10, sp, #176
-; CHECK-NEXT: add x8, sp, #48
-; CHECK-NEXT: add x9, sp, #144
-; CHECK-NEXT: add x20, sp, #176
-; CHECK-NEXT: ldr x15, [sp, #104]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x10]
+; CHECK-NEXT: add x8, x29, #32
+; CHECK-NEXT: add x9, x29, #136
+; CHECK-NEXT: mov x12, #32 // =0x20
+; CHECK-NEXT: ldp x10, x11, [x29, #336]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT: add x8, sp, #112
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9]
+; CHECK-NEXT: add x8, x29, #72
+; CHECK-NEXT: ld1d { z3.d }, p0/z, [x9]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8]
-; CHECK-NEXT: ldur q4, [sp, #88]
-; CHECK-NEXT: ldp x9, x8, [sp, #328]
-; CHECK-NEXT: ldr x19, [sp, #272]
-; CHECK-NEXT: ldp x11, x10, [sp, #312]
-; CHECK-NEXT: ldp x13, x12, [sp, #296]
-; CHECK-NEXT: ldp x18, x14, [sp, #280]
-; CHECK-NEXT: ldp x16, x17, [sp, #208]
-; CHECK-NEXT: ldp x21, x22, [sp, #352]
-; CHECK-NEXT: st1d { z3.d }, p0, [x20]
-; CHECK-NEXT: add x20, sp, #144
-; CHECK-NEXT: st1d { z2.d }, p0, [x20]
-; CHECK-NEXT: add x20, sp, #112
-; CHECK-NEXT: st1d { z1.d }, p0, [x20]
-; CHECK-NEXT: add x20, sp, #48
-; CHECK-NEXT: st1d { z0.d }, p0, [x20]
-; CHECK-NEXT: stp x21, x22, [sp, #352]
-; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: stp x19, x18, [sp, #272]
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: stp x16, x17, [sp, #208]
-; CHECK-NEXT: stur q4, [sp, #88]
-; CHECK-NEXT: str x15, [sp, #104]
-; CHECK-NEXT: stp x14, x13, [sp, #288]
-; CHECK-NEXT: stp x12, x11, [sp, #304]
-; CHECK-NEXT: stp x10, x9, [sp, #320]
-; CHECK-NEXT: str x8, [sp, #336]
-; CHECK-NEXT: ldr x29, [sp], #48 // 8-byte Folded Reload
-; CHECK-NEXT: b func2
+; CHECK-NEXT: add x8, x29, #104
+; CHECK-NEXT: add x9, x29, #288
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x8]
+; CHECK-NEXT: add x8, x29, #168
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x9]
+; CHECK-NEXT: ld1d { z4.d }, p0/z, [x8]
+; CHECK-NEXT: add x8, x29, #256
+; CHECK-NEXT: ldr x9, [x29, #320]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8]
+; CHECK-NEXT: ldr x8, [x29, #200]
+; CHECK-NEXT: st1d { z6.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #28 // =0x1c
+; CHECK-NEXT: st1d { z5.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #17 // =0x11
+; CHECK-NEXT: st1d { z4.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #13 // =0xd
+; CHECK-NEXT: st1d { z3.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #9 // =0x9
+; CHECK-NEXT: st1d { z2.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #5 // =0x5
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: stp x10, x11, [sp, #304]
+; CHECK-NEXT: str x9, [sp, #288]
+; CHECK-NEXT: str x8, [sp, #168]
+; CHECK-NEXT: bl func2
+; CHECK-NEXT: ldp x29, x30, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #352] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #368
+; CHECK-NEXT: ret
ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14, ptr %v15, ptr %v16,
ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr %v31, ptr %v32,
ptr %v33, ptr %v34, ptr %v35, ptr %v36, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr %v46, ptr %v47, ptr %v48,
i64 %v49) #0 {
- tail call void @func2(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
- ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr undef, ptr %v14, ptr %v15, ptr %v16,
- ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
- ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr undef, ptr undef,
- ptr undef, ptr undef, ptr undef, ptr undef, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
- ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr undef, ptr %v47, ptr %v48,
- i64 undef)
+ call void @func2(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
+ ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr undef, ptr %v14, ptr %v15, ptr %v16,
+ ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
+ ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr undef, ptr undef,
+ ptr undef, ptr undef, ptr undef, ptr undef, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
+ ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr undef, ptr %v47, ptr %v48,
+ i64 undef)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll b/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll
new file mode 100644
index 0000000000000..bde5df737407a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll
@@ -0,0 +1,55 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; Tail calls which have stack arguments in the same offsets as the caller do not
+; need to load and store the arguments from the stack.
+
+target triple = "aarch64-none-linux-gnu"
+
+declare i32 @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+
+; CHECK-LABEL: wrapper_func:
+define i32 @wrapper_func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+ ; CHECK: // %bb.
+ ; CHECK-NEXT: b func
+ %call = tail call i32 @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+ ret i32 %call
+}
+
+; CHECK-LABEL: wrapper_func_zero_arg:
+define i32 @wrapper_func_zero_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+ ; CHECK: // %bb.
+ ; CHECK-NEXT: mov w0, wzr
+ ; CHECK-NEXT: b func
+ %call = tail call i32 @func(i32 0, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+ ret i32 %call
+}
+
+; CHECK-LABEL: wrapper_func_zero_stack_arg:
+define i32 @wrapper_func_zero_stack_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+ ; CHECK: // %bb.
+ ; CHECK-NEXT: str wzr, [sp, #8]
+ ; CHECK-NEXT: b func
+ %call = tail call i32 @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 0)
+ ret i32 %call
+}
+
+; CHECK-LABEL: wrapper_func_overriden_arg:
+define i32 @wrapper_func_overriden_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+ ; CHECK: // %bb.
+ ; CHECK-NEXT: mov w1, w0
+ ; CHECK-NEXT: mov w0, wzr
+ ; CHECK-NEXT: b func
+ %call = tail call i32 @func(i32 0, i32 %a, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+ ret i32 %call
+}
+
+; CHECK-LABEL: wrapper_func_overriden_stack_arg:
+define i32 @wrapper_func_overriden_stack_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+ ; CHECK: // %bb.
+ ; CHECK-NEXT: ldr w8, [sp]
+ ; CHECK-NEXT: str wzr, [sp]
+ ; CHECK-NEXT: str w8, [sp, #8]
+ ; CHECK-NEXT: b func
+ %call = tail call i32 @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 0, i32 %i)
+ ret i32 %call
+}
dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = FINode->getIndex();
if (LoadNode->getMemoryVT() == VA.getValVT() &&
I think you should add a test for i1 args, whose memory type is different from their in-register type.
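Perhaps something along these lines, mirroring the tests in the patch (a hypothetical sketch, not part of the PR as posted; an i1 occupies a byte-sized stack slot, so its memory VT differs from its value VT and the getMemoryVT() == VA.getValVT() guard should reject the skip):

declare i32 @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)

define i32 @wrapper_func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j) {
  %call = tail call i32 @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)
  ret i32 %call
}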
Also, this probably needs a couple of tests for zeroext/sext args to make sure we do the right thing for smaller-than-32-bit args. In particular, we should make sure that when caller and callee don't agree on sign extension we still emit the caller-required part of it, and when they do agree we may omit it.
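A hypothetical pair of cases (function names and signatures are illustrative, not from the PR):

; Caller and callee disagree on the extension of the i16 stack argument,
; so the caller must still re-extend and store it before the tail call.
declare i32 @callee_sext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i16 signext %i)

define i32 @ext_mismatch(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i16 zeroext %i) {
  %call = tail call i32 @callee_sext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i16 signext %i)
  ret i32 %call
}

; Caller and callee agree, so the slot already holds the right bits and
; the store may be omitted.
declare i32 @callee_zext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i16 zeroext %i)

define i32 @ext_match(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i16 zeroext %i) {
  %call = tail call i32 @callee_zext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i16 zeroext %i)
  ret i32 %call
}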
Do we need the equivalent optimization in GlobalISel (AArch64CallLowering.cpp)?
We do, with
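The PR's actual GlobalISel change isn't reproduced on this page. Purely as an illustration, the analogous check in AArch64CallLowering.cpp might take this shape (a hypothetical sketch under assumed surrounding context — ValVReg, Offset, and the enclosing handler are not shown here):

// Hypothetical sketch: before emitting the outgoing-argument store,
// walk the vreg back to a load from the same immutable fixed stack
// slot and skip the store when the offsets line up (FPDiff == 0 case).
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
if (MachineInstr *Load = MRI.getVRegDef(ValVReg);
    Load && Load->getOpcode() == TargetOpcode::G_LOAD) {
  MachineInstr *PtrDef = MRI.getVRegDef(Load->getOperand(1).getReg());
  if (PtrDef && PtrDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    int FI = PtrDef->getOperand(1).getIndex();
    if (MFI.isImmutableObjectIndex(FI) && Offset == MFI.getObjectOffset(FI))
      return; // Value already lives in the right slot; skip the store.
  }
}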