[X86] Fix ABI for passing after i128 #124134
@@ -22,7 +22,7 @@ define i128 @on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5) {
define i64 @trailing_arg_on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i64 %a6) {
; CHECK-LABEL: trailing_arg_on_stack:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq 24(%rsp), %rax
+; CHECK-NEXT: movq %r9, %rax
This is the most relevant test diff.
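To spell out what changed, here is a minimal annotated version of that test (the register order rdi, rsi, rdx, rcx, r8, r9 is the x86-64 psABI integer-argument sequence):

define i64 @trailing_arg_on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3,
                                  i64 %a4, i128 %a5, i64 %a6) {
  ; %a0..%a4 take rdi, rsi, rdx, rcx, r8, leaving only r9 free.
  ; %a5 (i128) must not be split between r9 and the stack, so it is
  ; passed entirely on the stack.
  ; Before this patch, r9 stayed reserved as a shadow register, so %a6
  ; also went to the stack (movq 24(%rsp), %rax).
  ; After this patch, r9 stays available and %a6 is passed in it
  ; (movq %r9, %rax), matching the psABI.
  ret i64 %a6
}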
@llvm/pr-subscribers-backend-x86

Author: Nikita Popov (nikic)

Changes

If we're passing an i128 value and we no longer have enough argument registers (only r9 unallocated), the value gets passed via the stack. However, r9 is still allocated as a shadow register, which means that a following i64 argument will not use it. This doesn't match the x86-64 psABI.

Fix this by marking i128 arguments as requiring consecutive registers, and then adding a custom CC lowering that will allocate both parts of the i128 at the same time, either to registers or to the stack, without reserving a shadow register.

Fixes #123935.

Patch is 25.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124134.diff

14 Files Affected:
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 7359ef341dde58..74452685a0c465 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -340,5 +340,42 @@ static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return false;
}
+/// Special handling for i128: Either allocate the value to two consecutive
+/// i64 registers, or to the stack. Do not partially allocate in registers,
+/// and do not reserve any registers when allocating to the stack.
+static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ assert(ValVT == MVT::i64 && "Should have i64 parts");
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ unsigned NumRegs = PendingMembers.size();
+ assert(NumRegs == 2 && "Should have two parts");
+
+ MCPhysReg Regs[] = {X86::RDI, X86::RSI, X86::RDX,
+ X86::RCX, X86::R8, X86::R9};
+ ArrayRef<MCPhysReg> Allocated = State.AllocateRegBlock(Regs, NumRegs);
+ if (!Allocated.empty()) {
+ for (const auto &[Pending, Reg] : zip(PendingMembers, Allocated)) {
+ Pending.convertToReg(Reg);
+ State.addLoc(Pending);
+ }
+ } else {
+ int64_t Offset = State.AllocateStack(8, Align(16));
+ for (auto &Pending : PendingMembers) {
+ Pending.convertToMem(Offset);
+ State.addLoc(Pending);
+ Offset += 8;
+ }
+ }
+ PendingMembers.clear();
+ return true;
+}
+
// Provides entry points of CC_X86 and RetCC_X86.
#include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 91af111db8cda5..72b103b0bb0c50 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -548,11 +548,9 @@ def CC_X86_64_C : CallingConv<[
CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
// i128 can be either passed in two i64 registers, or on the stack, but
- // not split across register and stack. As such, do not allow using R9
- // for a split i64.
+ // not split across register and stack. Handle this with a custom function.
CCIfType<[i64],
- CCIfSplit<CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>>,
- CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [R9]>>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_X86_64_I128">>>,
CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e07bcd989c5188..fe79fefeed631c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1604,6 +1604,10 @@ namespace llvm {
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const override;
+ bool functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const override;
+
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 4a4fd246cb7cdf..6835c7e336a5cb 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -233,6 +233,14 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
return VT.changeVectorElementTypeToInteger();
}
+bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const {
+ // i128 split into i64 needs to be allocated to two consecutive registers,
+ // or spilled to the stack as a whole.
+ return Ty->isIntegerTy(128);
+}
+
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
index f8d32fc2d29252..97894db1188e28 100644
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -49,7 +49,7 @@ define i256 @add256(i256 %a, i256 %b) nounwind {
; CHECK-LABEL: add256:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: addq %r9, %rsi
; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8
diff --git a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll
index deca130a04ff04..bd764c2edef29d 100644
--- a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll
+++ b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll
@@ -31,15 +31,15 @@ define <2 x i128> @flag_copy_2(<2 x i128> %x, <2 x i128> %y) nounwind {
; CHECK-NEXT: movq %r8, %rdi
; CHECK-NEXT: {nf} sarq $63, %rdi
; CHECK-NEXT: cmovoq %rdi, %rcx
-; CHECK-NEXT: movabsq $-9223372036854775808, %r9 # imm = 0x8000000000000000
-; CHECK-NEXT: {nf} xorq %r9, %rdi
+; CHECK-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000
+; CHECK-NEXT: {nf} xorq %r10, %rdi
; CHECK-NEXT: cmovnoq %r8, %rdi
-; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: subq %r9, %rsi
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: {nf} sarq $63, %r8
; CHECK-NEXT: cmovoq %r8, %rsi
-; CHECK-NEXT: {nf} xorq %r9, %r8
+; CHECK-NEXT: {nf} xorq %r10, %r8
; CHECK-NEXT: cmovnoq %rdx, %r8
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq %rsi, (%rax)
diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll
index da16a7da48ca69..11e886e25ba4ea 100644
--- a/llvm/test/CodeGen/X86/avgflooru-i128.ll
+++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll
@@ -119,7 +119,7 @@ define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
; CHECK-LABEL: avgflooru_i128_vec:
; CHECK: # %bb.0: # %start
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: addq %r9, %rsi
; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: setb %dil
; CHECK-NEXT: movzbl %dil, %edi
diff --git a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll
index ccb2f37590b0ad..cbdfa32ed46274 100644
--- a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll
+++ b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll
@@ -1555,30 +1555,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
; SOFT-FLOAT-64-NEXT: .cfi_offset %r14, -32
; SOFT-FLOAT-64-NEXT: .cfi_offset %r15, -24
; SOFT-FLOAT-64-NEXT: .cfi_offset %rbp, -16
+; SOFT-FLOAT-64-NEXT: movq %r9, %rbp
; SOFT-FLOAT-64-NEXT: movq %rcx, %r14
; SOFT-FLOAT-64-NEXT: movq %rdx, %r15
-; SOFT-FLOAT-64-NEXT: movq %rsi, %r12
+; SOFT-FLOAT-64-NEXT: movq %rsi, %r13
; SOFT-FLOAT-64-NEXT: movq %rdi, %rbx
-; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-NEXT: movq %r8, %rdi
; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT
-; SOFT-FLOAT-64-NEXT: movq %rax, %r13
+; SOFT-FLOAT-64-NEXT: movq %rax, %r12
; SOFT-FLOAT-64-NEXT: movq %r14, %rdi
-; SOFT-FLOAT-64-NEXT: movq %rbp, %rsi
+; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-NEXT: movq %rax, %r14
; SOFT-FLOAT-64-NEXT: movq %r15, %rdi
; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-NEXT: movq %rax, %r15
-; SOFT-FLOAT-64-NEXT: movq %r12, %rdi
-; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SOFT-FLOAT-64-NEXT: movq %r13, %rdi
+; SOFT-FLOAT-64-NEXT: movq %rbp, %rsi
; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-NEXT: movq %rax, %rdi
; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT
-; SOFT-FLOAT-64-NEXT: movq %rax, %r12
+; SOFT-FLOAT-64-NEXT: movq %rax, %r13
; SOFT-FLOAT-64-NEXT: movq %r15, %rdi
; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT
@@ -1587,13 +1587,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT
; SOFT-FLOAT-64-NEXT: movq %rax, %r14
-; SOFT-FLOAT-64-NEXT: movq %r13, %rdi
+; SOFT-FLOAT-64-NEXT: movq %r12, %rdi
; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT
; SOFT-FLOAT-64-NEXT: movq %rax, 24(%rbx)
; SOFT-FLOAT-64-NEXT: movq %r14, 16(%rbx)
; SOFT-FLOAT-64-NEXT: movq %r15, 8(%rbx)
-; SOFT-FLOAT-64-NEXT: movq %r12, (%rbx)
+; SOFT-FLOAT-64-NEXT: movq %r13, (%rbx)
; SOFT-FLOAT-64-NEXT: movq %rbx, %rax
; SOFT-FLOAT-64-NEXT: addq $8, %rsp
; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 56
@@ -1633,30 +1633,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r14, -32
; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r15, -24
; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbp, -16
+; SOFT-FLOAT-64-FMA-NEXT: movq %r9, %rbp
; SOFT-FLOAT-64-FMA-NEXT: movq %rcx, %r14
; SOFT-FLOAT-64-FMA-NEXT: movq %rdx, %r15
-; SOFT-FLOAT-64-FMA-NEXT: movq %rsi, %r12
+; SOFT-FLOAT-64-FMA-NEXT: movq %rsi, %r13
; SOFT-FLOAT-64-FMA-NEXT: movq %rdi, %rbx
-; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA-NEXT: movq %r8, %rdi
; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT
-; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r13
+; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r12
; SOFT-FLOAT-64-FMA-NEXT: movq %r14, %rdi
-; SOFT-FLOAT-64-FMA-NEXT: movq %rbp, %rsi
+; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14
; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi
; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r15
-; SOFT-FLOAT-64-FMA-NEXT: movq %r12, %rdi
-; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SOFT-FLOAT-64-FMA-NEXT: movq %r13, %rdi
+; SOFT-FLOAT-64-FMA-NEXT: movq %rbp, %rsi
; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %rdi
; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT
-; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r12
+; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r13
; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi
; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT
@@ -1665,13 +1665,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT
; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14
-; SOFT-FLOAT-64-FMA-NEXT: movq %r13, %rdi
+; SOFT-FLOAT-64-FMA-NEXT: movq %r12, %rdi
; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT
; SOFT-FLOAT-64-FMA-NEXT: movq %rax, 24(%rbx)
; SOFT-FLOAT-64-FMA-NEXT: movq %r14, 16(%rbx)
; SOFT-FLOAT-64-FMA-NEXT: movq %r15, 8(%rbx)
-; SOFT-FLOAT-64-FMA-NEXT: movq %r12, (%rbx)
+; SOFT-FLOAT-64-FMA-NEXT: movq %r13, (%rbx)
; SOFT-FLOAT-64-FMA-NEXT: movq %rbx, %rax
; SOFT-FLOAT-64-FMA-NEXT: addq $8, %rsp
; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 56
@@ -1711,30 +1711,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r14, -32
; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r15, -24
; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbp, -16
+; SOFT-FLOAT-64-FMA4-NEXT: movq %r9, %rbp
; SOFT-FLOAT-64-FMA4-NEXT: movq %rcx, %r14
; SOFT-FLOAT-64-FMA4-NEXT: movq %rdx, %r15
-; SOFT-FLOAT-64-FMA4-NEXT: movq %rsi, %r12
+; SOFT-FLOAT-64-FMA4-NEXT: movq %rsi, %r13
; SOFT-FLOAT-64-FMA4-NEXT: movq %rdi, %rbx
-; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA4-NEXT: movq %r8, %rdi
; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT
-; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r13
+; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r12
; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, %rdi
-; SOFT-FLOAT-64-FMA4-NEXT: movq %rbp, %rsi
+; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r14
; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi
; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r15
-; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, %rdi
-; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, %rdi
+; SOFT-FLOAT-64-FMA4-NEXT: movq %rbp, %rsi
; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT
; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %rdi
; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT
-; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r12
+; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r13
; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi
; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT
@@ -1743,13 +1743,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT
; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r14
-; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, %rdi
+; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, %rdi
; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT
; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, 24(%rbx)
; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, 16(%rbx)
; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, 8(%rbx)
-; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, (%rbx)
+; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, (%rbx)
; SOFT-FLOAT-64-FMA4-NEXT: movq %rbx, %rax
; SOFT-FLOAT-64-FMA4-NEXT: addq $8, %rsp
; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 56
diff --git a/llvm/test/CodeGen/X86/i128-abi.ll b/llvm/test/CodeGen/X86/i128-abi.ll
index d1d6f86e08fb8b..1a635e703c0593 100644
--- a/llvm/test/CodeGen/X86/i128-abi.ll
+++ b/llvm/test/CodeGen/X86/i128-abi.ll
@@ -22,7 +22,7 @@ define i128 @on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5) {
define i64 @trailing_arg_on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i64 %a6) {
; CHECK-LABEL: trailing_arg_on_stack:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq 24(%rsp), %rax
+; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: retq
ret i64 %a6
}
@@ -69,20 +69,18 @@ define void @call_trailing_arg_on_stack(i128 %x, i64 %y) nounwind {
; CHECK-LABEL: call_trailing_arg_on_stack:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: movq %rsi, %r9
+; CHECK-NEXT: movq %rdx, %r9
+; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: movq %rdi, %r10
-; CHECK-NEXT: subq $8, %rsp
; CHECK-NEXT: movl $1, %esi
; CHECK-NEXT: movl $2, %edx
; CHECK-NEXT: movl $3, %ecx
; CHECK-NEXT: movl $4, %r8d
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: pushq %r9
; CHECK-NEXT: pushq %r10
; CHECK-NEXT: callq trailing_arg_on_stack@PLT
-; CHECK-NEXT: addq $32, %rsp
+; CHECK-NEXT: addq $16, %rsp
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
call i128 @trailing_arg_on_stack(i64 0, i64 1, i64 2, i64 3, i64 4, i128 %x, i64 %y)
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 322acd76e12e63..bd563f97b0ac4b 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -1795,27 +1795,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; SSE-NEXT: seto %dil
-; SSE-NEXT: movq %r8, %r9
-; SSE-NEXT: sarq $63, %r9
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: sarq $63, %r10
; SSE-NEXT: testb %dil, %dil
-; SSE-NEXT: cmovneq %r9, %rcx
-; SSE-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000
-; SSE-NEXT: xorq %r10, %r9
+; SSE-NEXT: cmovneq %r10, %rcx
+; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
+; SSE-NEXT: xorq %r11, %r10
; SSE-NEXT: testb %dil, %dil
-; SSE-NEXT: cmoveq %r8, %r9
-; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT: cmoveq %r8, %r10
+; SSE-NEXT: addq %r9, %rsi
; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; SSE-NEXT: seto %dil
; SSE-NEXT: movq %rdx, %r8
; SSE-NEXT: sarq $63, %r8
; SSE-NEXT: testb %dil, %dil
; SSE-NEXT: cmovneq %r8, %rsi
-; SSE-NEXT: xorq %r10, %r8
+; SSE-NEXT: xorq %r11, %r8
; SSE-NEXT: testb %dil, %dil
; SSE-NEXT: cmoveq %rdx, %r8
; SSE-NEXT: movq %rcx, 16(%rax)
; SSE-NEXT: movq %rsi, (%rax)
-; SSE-NEXT: movq %r9, 24(%rax)
+; SSE-NEXT: movq %r10, 24(%rax)
; SSE-NEXT: movq %r8, 8(%rax)
; SSE-NEXT: retq
;
@@ -1825,27 +1825,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX-NEXT: seto %dil
-; AVX-NEXT: movq %r8, %r9
-; AVX-NEXT: sarq $63, %r9
+; AVX-NEXT: movq %r8, %r10
+; AVX-NEXT: sarq $63, %r10
; AVX-NEXT: testb %dil, %dil
-; AVX-NEXT: cmovneq %r9, %rcx
-; AVX-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000
-; AVX-NEXT: xorq %r10, %r9
+; AVX-NEXT: cmovneq %r10, %rcx
+; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
+; AVX-NEXT: xorq %r11, %r10
; AVX-NEXT: testb %dil, %dil
-; AVX-NEXT: cmoveq %r8, %r9
-; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: cmoveq %r8, %r10
+; AVX-NEXT: addq %r9, %rsi
; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: seto %dil
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: sarq $63, %r8
; AVX-NEXT: testb %dil, %dil
; AVX-NEXT: cmovneq %r8, %rsi
-; AVX-NEXT: xorq %r10, %r8
+; AVX-NEXT: xorq %r11, %r8
; AVX-NEXT: testb %dil, %dil
; AVX-NEXT: cmoveq %rdx, %r8
; AVX-NEXT: movq %rcx, 16(%rax)
; AVX-NEXT: movq %rsi, (%rax)
-; AVX-NEXT: movq %r9, 24(%rax)
+; AVX-NEXT: movq %r10, 24(%rax)
; AVX-NEXT: movq %r8, 8(%rax)
; AVX-NEXT: retq
%z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index ac8b561abf0033..88df3c175ec9cc 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -2026,27 +2026,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; SSE-NEXT: seto %dil
-; SSE-NEXT: movq %r8, %r9
-; SSE-NEXT: sarq $63, %r9
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: sarq $63, %r10
; SSE-NEXT: testb %dil, %dil
-; SSE-NEXT: cmovneq %r9, %rcx
-; SSE-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000
-; SSE-NEXT: xorq %r10, %r9
+; SSE-NEXT: cmovneq %r10, %rcx
+; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
+; SSE-NEXT: xorq %r11, %r10
; SSE-NEXT: testb %dil, %dil
-; SSE-NEXT: cmoveq %r8, %r9
-; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rsi
+; SS...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed abcff55 to 74b8d74.
Probably worth noting that it's expected that the frontend will directly pass via stack for things like <2 x i128>.
Tentative LGTM, except for the AllocateStack question. I'm not too familiar with the calling conv code, so I'd prefer if someone else also acks this.
State.addLoc(Pending);
}
} else {
int64_t Offset = State.AllocateStack(8, Align(16));
Should size be 16 here? I don't think we have a test like this:
define i128 @on_stack2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i128 %a6) {
ret i128 %a6
}
Yeah, this was supposed to be 16, thanks! (Leftover from previous implementation where I allocated them separately.)
I also added that test in bca6dbd, but it wasn't actually affected by this issue because we'd still end up with the correct offsets due to alignment.
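To make the alignment point concrete, here is a hypothetical walk-through of the offsets for that test (assuming the argument stack area starts at offset 0):

define i128 @on_stack2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4,
                       i128 %a5, i128 %a6) {
  ; %a0..%a4 take rdi..r8, so both i128s go on the stack.
  ; %a5: AllocateStack(8, Align(16)) returns offset 0; the loop records
  ;      the parts at 0 and 8, but only [0, 8) is actually reserved.
  ; %a6: the 16-byte alignment rounds the next free offset (8) up to 16,
  ;      so its parts land at 16 and 24, the same offsets that
  ;      AllocateStack(16, Align(16)) would have produced.
  ret i128 %a6
}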
; CHECK-NEXT: cmovnoq %r8, %rdi
-; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: subq %r9, %rsi
Is this correct? The fourth i128 seems to be split between R9 and the stack.
Ok, so you mean either this or four single i128s are not legal arguments, because even for the latter, the fourth i128 should be turned into memory by the FE. In this way, I think we may not need to handle the problem here.
Yeah, I'm assuming that the frontend will handle <2 x i128> by directly using byval (https://clang.godbolt.org/z/bznzTKohz -- interestingly clang still directly returns the vector, I would have expected it to use sret, rather than relying on sret demotion in the backend...)
I could extend this code to also handle vectors of i128 and require the whole vector argument to be in consecutive registers. I'm just not sure it makes sense to handle this, as there is no defined psABI for <2 x i128> in the first place, and if there were, the frontend would handle that part.
Returning is another story; we emit warnings sometimes: https://clang.godbolt.org/z/fYo937x3K. I think we can leave it as is.
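For reference, a rough sketch of the byval-style lowering assumed above (the type and callee names are illustrative, not clang's actual output):

%two_i128 = type { i128, i128 }

; The frontend passes a pointer to a stack copy instead of trying to
; pass <2 x i128> in integer registers, so the backend never sees an
; aggregate that could be split across registers and stack.
declare void @callee(ptr byval(%two_i128) align 16)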
@@ -1555,30 +1555,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
; SOFT-FLOAT-64-NEXT: .cfi_offset %r14, -32
; SOFT-FLOAT-64-NEXT: .cfi_offset %r15, -24
; SOFT-FLOAT-64-NEXT: .cfi_offset %rbp, -16
+; SOFT-FLOAT-64-NEXT: movq %r9, %rbp
There's no i128, but <4 x double>. This test should not be affected, right?
Alright, soft-float, another case where we can ignore the ABI...
Right. The previous implementation also (unintentionally) affected cases like an illegal <4 x i64> vector (or <4 x double> for softfloat) and wouldn't allow the first element of the vector to use r9. So the whole value could still be split across regs and stacks, just not between the 1st and 2nd element...
Why doesn't it trigger assert(NumRegs == 2 && "Should have two parts");? I assume there will be 4 consecutive i64 parts to handle.
Because functionArgumentNeedsConsecutiveRegisters only returns true for i128, so <4 x double> gets the default behavior.
I see, thanks!
@@ -78,20 +78,18 @@ define void @call_trailing_arg_on_stack(i128 %x, i64 %y) nounwind {
; CHECK-LABEL: call_trailing_arg_on_stack:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: movq %rsi, %r9
+; CHECK-NEXT: movq %rdx, %r9
This is relevant too. The %y is passed through R9 now for trailing_arg_on_stack. (So the function name is not correct anymore :)
int64_t Offset = State.AllocateStack(16, Align(16));
for (auto &Pending : PendingMembers) {
Pending.convertToMem(Offset);
State.addLoc(Pending);
Offset += 8;
}
This is not shown in a test case, but I have a concern: would we get two discontiguous i64 parts due to the alignment?
Oh, I saw the pre-committed case now.
LGTM.
}
} else {
int64_t Offset = State.AllocateStack(16, Align(16));
for (auto &Pending : PendingMembers) {
Nit: maybe better to use PendingMembers[0/1] directly.
Done. I kept this a bit more generic originally in case we want to handle cases like <2 x i128> in the future, but for now we don't need the loops...
Pending.convertToReg(Reg);
State.addLoc(Pending);
PendingMembers[0/1] and Allocated[0/1]?
unsigned NumRegs = PendingMembers.size();
assert(NumRegs == 2 && "Should have two parts");

MCPhysReg Regs[] = {X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9};
static const?
Done!