//===-- lib/CodeGen/GlobalISel/CallLowering.cpp - Call lowering -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements some simple delegations needed for call lowering.
///
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
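
// DEBUG_TYPE feeds LLVM's debug machinery (the LLVM_DEBUG macro and the
// -debug-only=call-lowering flag), tagging debug output from this file.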
#define DEBUG_TYPE "call-lowering"

using namespace llvm;

void CallLowering::anchor() {}

/// Helper function which updates \p Flags when \p AttrFn returns true.
static void
addFlagsUsingAttrFn(ISD::ArgFlagsTy &Flags,
                    const std::function<bool(Attribute::AttrKind)> &AttrFn) {
  if (AttrFn(Attribute::SExt))
    Flags.setSExt();
  if (AttrFn(Attribute::ZExt))
    Flags.setZExt();
  if (AttrFn(Attribute::InReg))
    Flags.setInReg();
  if (AttrFn(Attribute::StructRet))
    Flags.setSRet();
  if (AttrFn(Attribute::Nest))
    Flags.setNest();
  if (AttrFn(Attribute::ByVal))
    Flags.setByVal();
  if (AttrFn(Attribute::Preallocated))
    Flags.setPreallocated();
  if (AttrFn(Attribute::InAlloca))
    Flags.setInAlloca();
  if (AttrFn(Attribute::Returned))
    Flags.setReturned();
  if (AttrFn(Attribute::SwiftSelf))
    Flags.setSwiftSelf();
  if (AttrFn(Attribute::SwiftAsync))
    Flags.setSwiftAsync();
  if (AttrFn(Attribute::SwiftError))
    Flags.setSwiftError();
}
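
/// Collect the ISD argument flags implied by the IR attributes present on
/// argument \p ArgIdx of the call \p Call.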
ISD::ArgFlagsTy CallLowering::getAttributesForArgIdx(const CallBase &Call,
                                                     unsigned ArgIdx) const {
  ISD::ArgFlagsTy Flags;
  addFlagsUsingAttrFn(Flags, [&Call, &ArgIdx](Attribute::AttrKind Attr) {
    return Call.paramHasAttr(ArgIdx, Attr);
  });
  return Flags;
}
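
/// Collect the ISD argument flags implied by the attributes held at index
/// \p OpIdx of the attribute list \p Attrs.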
void CallLowering::addArgFlagsFromAttributes(ISD::ArgFlagsTy &Flags,
                                             const AttributeList &Attrs,
                                             unsigned OpIdx) const {
  addFlagsUsingAttrFn(Flags, [&Attrs, &OpIdx](Attribute::AttrKind Attr) {
    return Attrs.hasAttribute(OpIdx, Attr);
  });
}

bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
                             ArrayRef<Register> ResRegs,
                             ArrayRef<ArrayRef<Register>> ArgRegs,
                             Register SwiftErrorVReg,
                             std::function<unsigned()> GetCalleeReg) const {
  CallLoweringInfo Info;
  const DataLayout &DL = MIRBuilder.getDataLayout();
  MachineFunction &MF = MIRBuilder.getMF();
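
  // A call can only be lowered as a tail call if the IR call is marked tail,
  // it sits in a valid tail-call position, and the caller has not disabled
  // tail calls via the "disable-tail-calls" function attribute.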
  bool CanBeTailCalled = CB.isTailCall() &&
                         isInTailCallPosition(CB, MF.getTarget()) &&
                         (MF.getFunction()
                              .getFnAttribute("disable-tail-calls")
                              .getValueAsString() != "true");

  CallingConv::ID CallConv = CB.getCallingConv();
  Type *RetTy = CB.getType();
  bool IsVarArg = CB.getFunctionType()->isVarArg();

  SmallVector<BaseArgInfo, 4> SplitArgs;
  getReturnInfo(CallConv, RetTy, CB.getAttributes(), SplitArgs, DL);
  Info.CanLowerReturn = canLowerReturn(MF, CallConv, SplitArgs, IsVarArg);

  if (!Info.CanLowerReturn) {
    // Callee requires sret demotion.
    insertSRetOutgoingArgument(MIRBuilder, CB, Info);

    // The sret demotion isn't compatible with tail-calls, since the sret
    // argument points into the caller's stack frame.
    CanBeTailCalled = false;
  }

  // First step is to marshal all the function's parameters into the correct
  // physregs and memory locations. Gather the sequence of argument types that
  // we'll pass to the assigner function.
  unsigned i = 0;
  unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
  for (auto &Arg : CB.args()) {
    ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i),
                    i < NumFixedArgs};
    setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);

    // If we have an explicit sret argument that is an Instruction (i.e. it
    // might point to function-local memory), we can't meaningfully tail-call.
    if (OrigArg.Flags[0].isSRet() && isa<Instruction>(&Arg))
      CanBeTailCalled = false;

    Info.OrigArgs.push_back(OrigArg);
    ++i;
  }

  // Try looking through a bitcast from one function type to another.
  // Commonly happens with calls to objc_msgSend().
  const Value *CalleeV = CB.getCalledOperand()->stripPointerCasts();
  if (const Function *F = dyn_cast<Function>(CalleeV))
    Info.Callee = MachineOperand::CreateGA(F, 0);
  else
    Info.Callee = MachineOperand::CreateReg(GetCalleeReg(), false);

  Info.OrigRet = ArgInfo{ResRegs, RetTy, 0, ISD::ArgFlagsTy{}};
  if (!Info.OrigRet.Ty->isVoidTy())
    setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CB);

  Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
  Info.CallConv = CallConv;
  Info.SwiftErrorVReg = SwiftErrorVReg;
  Info.IsMustTailCall = CB.isMustTailCall();
  Info.IsTailCall = CanBeTailCalled;
  Info.IsVarArg = IsVarArg;
  return lowerCall(MIRBuilder, Info);
}
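
/// Populate \p Arg.Flags from the attributes and type of operand \p OpIdx of
/// \p FuncInfo, which is either a Function (for formal arguments) or a
/// CallBase (for call-site arguments); see the explicit instantiations below.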
template <typename FuncInfoTy>
void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
                               const DataLayout &DL,
                               const FuncInfoTy &FuncInfo) const {
  auto &Flags = Arg.Flags[0];
  const AttributeList &Attrs = FuncInfo.getAttributes();
  addArgFlagsFromAttributes(Flags, Attrs, OpIdx);

  PointerType *PtrTy = dyn_cast<PointerType>(Arg.Ty->getScalarType());
  if (PtrTy) {
    Flags.setPointer();
    Flags.setPointerAddrSpace(PtrTy->getPointerAddressSpace());
  }
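
  // Start from the ABI type alignment; byval-like arguments may override it
  // below from explicit alignment attributes in the IR.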
  Align MemAlign = DL.getABITypeAlign(Arg.Ty);
  if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
    assert(OpIdx >= AttributeList::FirstArgIndex);
    Type *ElementTy = PtrTy->getElementType();

    auto Ty = Attrs.getAttribute(OpIdx, Attribute::ByVal).getValueAsType();
    Flags.setByValSize(DL.getTypeAllocSize(Ty ? Ty : ElementTy));

    // For ByVal, alignment should be passed from FE. BE will guess if
    // this info is not there but there are cases it cannot get right.
    if (auto ParamAlign =
            FuncInfo.getParamStackAlign(OpIdx - AttributeList::FirstArgIndex))
      MemAlign = *ParamAlign;
    else if ((ParamAlign =
                  FuncInfo.getParamAlign(OpIdx - AttributeList::FirstArgIndex)))
      MemAlign = *ParamAlign;
    else
      MemAlign = Align(getTLI()->getByValTypeAlignment(ElementTy, DL));
  } else if (OpIdx >= AttributeList::FirstArgIndex) {
    if (auto ParamAlign =
            FuncInfo.getParamStackAlign(OpIdx - AttributeList::FirstArgIndex))
      MemAlign = *ParamAlign;
  }
  Flags.setMemAlign(MemAlign);
  Flags.setOrigAlign(DL.getABITypeAlign(Arg.Ty));

  // Don't try to use the returned attribute if the argument is marked as
  // swiftself, since it won't be passed in x0.
  if (Flags.isSwiftSelf())
    Flags.setReturned(false);
}

template void
CallLowering::setArgFlags<Function>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
                                    const DataLayout &DL,
                                    const Function &FuncInfo) const;

template void
CallLowering::setArgFlags<CallBase>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
                                    const DataLayout &DL,
                                    const CallBase &FuncInfo) const;
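
/// Split \p OrigArg into one ArgInfo per leaf value type produced by
/// ComputeValueVTs, e.g. an argument of type {double, i32} yields an f64 and
/// an i32 ArgInfo, with \p Offsets (if non-null) receiving each piece's byte
/// offset within the original value.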
void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
                                     SmallVectorImpl<ArgInfo> &SplitArgs,
                                     const DataLayout &DL,
                                     CallingConv::ID CallConv,
                                     SmallVectorImpl<uint64_t> *Offsets) const {
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, Offsets, 0);

  if (SplitVTs.size() == 0)
    return;

  if (SplitVTs.size() == 1) {
    // No splitting to do, but we want to replace the original type (e.g. [1 x
    // double] -> double).
    SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
                           OrigArg.OrigArgIndex, OrigArg.Flags[0],
                           OrigArg.IsFixed, OrigArg.OrigValue);
    return;
  }

  // Create one ArgInfo for each virtual register in the original ArgInfo.
  assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");

  bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
      OrigArg.Ty, CallConv, false, DL);
  for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
    Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
    SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.OrigArgIndex,
                           OrigArg.Flags[0], OrigArg.IsFixed);
    if (NeedsRegBlock)
      SplitArgs.back().Flags[0].setInConsecutiveRegs();
  }

  SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder
mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
                            ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  // We need to create an unmerge to the result registers, which may require
  // widening the original value.
  Register UnmergeSrcReg;
  if (LCMTy != PartLLT) {
    // e.g. A <3 x s16> value was split to <2 x s16>:
    // %register_value0:_(<2 x s16>)
    // %register_value1:_(<2 x s16>)
    // %undef:_(<2 x s16>) = G_IMPLICIT_DEF
    // %concat:_(<6 x s16>) = G_CONCAT_VECTORS %reg_value0, %reg_value1, %undef
    // %dst_reg:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %concat
    const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
    Register Undef = B.buildUndef(PartLLT).getReg(0);

    // Build vector of undefs.
    SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

    // Replace the first sources with the real registers.
    std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
    UnmergeSrcReg = B.buildConcatVectors(LCMTy, WidenedSrcs).getReg(0);
  } else {
    // We don't need to widen anything if we're extracting a scalar which was
    // promoted to a vector e.g. s8 -> v4s8 -> s8
    assert(SrcRegs.size() == 1);
    UnmergeSrcReg = SrcRegs[0];
  }

  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, UnmergeSrcReg);
}

/// Create a sequence of instructions to combine pieces split into register
/// typed values to the original IR value. \p OrigRegs contains the destination
/// value registers of type \p LLTy, and \p Regs contains the legalized pieces
/// with type \p PartLLT. This is used for incoming values (physregs to vregs).
static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
                              ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT,
                              const ISD::ArgFlagsTy Flags) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (PartLLT == LLTy) {
    // We should have avoided introducing a new virtual register, and just
    // directly assigned here.
    assert(OrigRegs[0] == Regs[0]);
    return;
  }

  if (PartLLT.getSizeInBits() == LLTy.getSizeInBits() &&
      OrigRegs.size() == 1 && Regs.size() == 1) {
    B.buildBitcast(OrigRegs[0], Regs[0]);
    return;
  }

  // A vector PartLLT needs extending to LLTy's element size.
  // E.g. <2 x s64> = G_SEXT <2 x s32>.
  if (PartLLT.isVector() == LLTy.isVector() &&
      PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits() &&
      (!PartLLT.isVector() ||
       PartLLT.getNumElements() == LLTy.getNumElements()) &&
      OrigRegs.size() == 1 && Regs.size() == 1) {
    Register SrcReg = Regs[0];

    LLT LocTy = MRI.getType(SrcReg);

    if (Flags.isSExt()) {
      SrcReg = B.buildAssertSExt(LocTy, SrcReg, LLTy.getScalarSizeInBits())
                   .getReg(0);
    } else if (Flags.isZExt()) {
      SrcReg = B.buildAssertZExt(LocTy, SrcReg, LLTy.getScalarSizeInBits())
                   .getReg(0);
    }
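
    // SrcReg now carries the incoming part, with any known sign/zero
    // extension recorded by the G_ASSERT_SEXT/G_ASSERT_ZEXT built above;
    // all that remains is to truncate it back to the original type.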

    // Sometimes pointers are passed zero extended.
    LLT OrigTy = MRI.getType(OrigRegs[0]);
    if (OrigTy.isPointer()) {
      LLT IntPtrTy = LLT::scalar(OrigTy.getSizeInBits());
      B.buildIntToPtr(OrigRegs[0], B.buildTrunc(IntPtrTy, SrcReg));
      return;
    }

    B.buildTrunc(OrigRegs[0], SrcReg);
    return;
  }

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits().getFixedSize() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

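  // A sketch of the widen-then-truncate case above, assuming a hypothetical
  // s96 value reassembled from two s64 parts:
  //   %wide:_(s128) = G_MERGE_VALUES %p0(s64), %p1(s64)
  //   %orig:_(s96) = G_TRUNC %wide(s128)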
  if (PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    SmallVector<Register> CastRegs(Regs.begin(), Regs.end());

    // If PartLLT is a mismatched vector in both number of elements and element
    // size, e.g. PartLLT == v2s64 and LLTy is v3s32, then first coerce it to
    // have the same elt type, i.e. v4s32.
    if (PartLLT.getSizeInBits() > LLTy.getSizeInBits() &&
        PartLLT.getScalarSizeInBits() == LLTy.getScalarSizeInBits() * 2 &&
        Regs.size() == 1) {
      LLT NewTy = PartLLT.changeElementType(LLTy.getElementType())
                      .changeElementCount(PartLLT.getElementCount() * 2);
      CastRegs[0] = B.buildBitcast(NewTy, Regs[0]).getReg(0);
      PartLLT = NewTy;
    }

    if (LLTy.getScalarType() == PartLLT.getElementType()) {
      mergeVectorRegsToResultRegs(B, OrigRegs, CastRegs);
    } else {
      unsigned I = 0;
      LLT GCDTy = getGCDType(LLTy, PartLLT);

      // We are both splitting a vector, and bitcasting its element types. Cast
      // the source pieces into the appropriate number of pieces with the
      // result element type.
      for (Register SrcReg : CastRegs)
        CastRegs[I++] = B.buildBitcast(GCDTy, SrcReg).getReg(0);
      mergeVectorRegsToResultRegs(B, OrigRegs, CastRegs);
    }

    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register
  // types to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with a vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
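    // A sketch, assuming a hypothetical v2s64 value arriving as four s32
    // parts:
    //   %e0:_(s64) = G_MERGE_VALUES %p0(s32), %p1(s32)
    //   %e1:_(s64) = G_MERGE_VALUES %p2(s32), %p3(s32)
    //   %v:_(<2 x s64>) = G_BUILD_VECTOR %e0(s64), %e1(s64)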
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    // FIXME: Should handle floating point promotions.
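    // A sketch, assuming a hypothetical v4s16 result whose elements were
    // promoted to s32 parts:
    //   %bv:_(<4 x s32>) = G_BUILD_VECTOR %p0(s32), %p1(s32), %p2(s32), %p3(s32)
    //   %v:_(<4 x s16>) = G_TRUNC %bv(<4 x s32>)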
    LLT BVType = LLT::fixed_vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

/// Create a sequence of instructions to expand the value in \p SrcReg (of type
/// \p SrcTy) to the types in \p DstRegs (of type \p PartTy). \p ExtendOp should
/// contain the type of scalar value extension if necessary.
///
/// This is used for outgoing values (vregs to physregs).
static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
                            Register SrcReg, LLT SrcTy, LLT PartTy,
                            unsigned ExtendOp = TargetOpcode::G_ANYEXT) {
  // We could just insert a regular copy, but this is unreachable at the moment.
  assert(SrcTy != PartTy && "identical part types shouldn't reach here");

  const unsigned PartSize = PartTy.getSizeInBits();

  if (PartTy.isVector() == SrcTy.isVector() &&
      PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) {
    assert(DstRegs.size() == 1);
    B.buildInstr(ExtendOp, {DstRegs[0]}, {SrcReg});
    return;
  }

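  // A sketch of the promotion path above, assuming a hypothetical s16 value
  // passed sign-extended in an s32 part:
  //   %part:_(s32) = G_SEXT %val(s16)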
  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

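  // A sketch of the scalarize-and-extend case above, assuming a hypothetical
  // v2s16 value passed as two any-extended s32 parts:
  //   %e0:_(s16), %e1:_(s16) = G_UNMERGE_VALUES %val(<2 x s16>)
  //   %p0:_(s32) = G_ANYEXT %e0(s16)
  //   %p1:_(s32) = G_ANYEXT %e1(s16)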
  LLT GCDTy = getGCDType(SrcTy, PartTy);
  if (GCDTy == PartTy) {
    // If this is already evenly divisible, we can create a simple unmerge.
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

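  // E.g. an s128 value passed in two s64 parts unmerges directly:
  //   %p0:_(s64), %p1:_(s64) = G_UNMERGE_VALUES %val(s128)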
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT DstTy = MRI.getType(DstRegs[0]);
  LLT LCMTy = getLCMType(SrcTy, PartTy);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned CoveringSize = LCMTy.getSizeInBits();

  Register UnmergeSrc = SrcReg;

  if (CoveringSize != SrcSize) {
    // For scalars, it's common to be able to use a simple extension.
    if (SrcTy.isScalar() && DstTy.isScalar()) {
      CoveringSize = alignTo(SrcSize, DstSize);
      LLT CoverTy = LLT::scalar(CoveringSize);
      UnmergeSrc = B.buildInstr(ExtendOp, {CoverTy}, {SrcReg}).getReg(0);
    } else {
      // Widen to the common type.
      // FIXME: This should respect the extend type.
      Register Undef = B.buildUndef(SrcTy).getReg(0);
      SmallVector<Register, 8> MergeParts(1, SrcReg);
      for (unsigned Size = SrcSize; Size != CoveringSize; Size += SrcSize)
        MergeParts.push_back(Undef);
      UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
    }
  }

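  // Scalar sketch: a hypothetical s96 value passed in two s64 parts is first
  // extended to the covering type:
  //   %wide:_(s128) = G_ANYEXT %val(s96)
  // and the unmerge below then yields the two s64 parts exactly, with no dead
  // padding defs needed.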
  // Unmerge to the original registers and pad with dead defs.
  SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
  for (unsigned Size = DstSize * DstRegs.size(); Size != CoveringSize;
       Size += DstSize) {
    UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
  }

  B.buildUnmerge(UnmergeResults, UnmergeSrc);
}

bool CallLowering::determineAndHandleAssignments(
    ValueHandler &Handler, ValueAssigner &Assigner,
    SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
    CallingConv::ID CallConv, bool IsVarArg, Register ThisReturnReg) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const Function &F = MF.getFunction();
  SmallVector<CCValAssign, 16> ArgLocs;

  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, F.getContext());

  if (!determineAssignments(Assigner, Args, CCInfo))
    return false;

  return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder,
                           ThisReturnReg);
}

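// A rough usage sketch of determineAndHandleAssignments from a target's call
// lowering (the handler type and assign function names are hypothetical):
//   OutgoingValueAssigner Assigner(AssignFnFixed);
//   MyOutgoingValueHandler Handler(MIRBuilder, MRI);
//   if (!determineAndHandleAssignments(Handler, Assigner, SplitArgs,
//                                      MIRBuilder, Info.CallConv,
//                                      Info.IsVarArg))
//     return false;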
static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) {
  if (Flags.isSExt())
    return TargetOpcode::G_SEXT;
  if (Flags.isZExt())
    return TargetOpcode::G_ZEXT;
  return TargetOpcode::G_ANYEXT;
}

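// extendOpFromFlags above: e.g. an argument marked 'signext' in the IR yields
// G_SEXT, 'zeroext' yields G_ZEXT, and anything else falls back to G_ANYEXT.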
bool CallLowering::determineAssignments(ValueAssigner &Assigner,
                                        SmallVectorImpl<ArgInfo> &Args,
                                        CCState &CCInfo) const {
  LLVMContext &Ctx = CCInfo.getContext();
  const CallingConv::ID CallConv = CCInfo.getCallingConv();

  unsigned NumArgs = Args.size();
  for (unsigned i = 0; i != NumArgs; ++i) {
    EVT CurVT = EVT::getEVT(Args[i].Ty);

    MVT NewVT = TLI->getRegisterTypeForCallingConv(Ctx, CallConv, CurVT);

    // If we need to split the type over multiple regs, check it's a scenario
    // we currently support.
    unsigned NumParts =
        TLI->getNumRegistersForCallingConv(Ctx, CallConv, CurVT);

    if (NumParts == 1) {
      // Try to use the register type if we couldn't assign the VT.
      if (Assigner.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i],
                             Args[i].Flags[0], CCInfo))
        return false;
      continue;
    }

    // For incoming arguments (physregs to vregs), we could have values in
    // physregs (or memlocs) which we want to extract and copy to vregs.
    // During this, we might have to deal with the LLT being split across
    // multiple regs, so we have to record this information for later.
    //
    // If we have outgoing args, then we have the opposite case. We have a
    // vreg with an LLT which we want to assign to a physical location, and
    // we might have to record that the value has to be split later.

    // We're handling an incoming arg which is split over multiple regs.
    // E.g. passing an s128 on AArch64.
    ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0];
    Args[i].Flags.clear();

    for (unsigned Part = 0; Part < NumParts; ++Part) {
      ISD::ArgFlagsTy Flags = OrigFlags;
      if (Part == 0) {
        Flags.setSplit();
      } else {
        Flags.setOrigAlign(Align(1));
        if (Part == NumParts - 1)
          Flags.setSplitEnd();
      }

      if (!Assigner.isIncomingArgumentHandler()) {
        // TODO: Also check if there is a valid extension that preserves the
        // bits. However currently this call lowering doesn't support non-exact
        // split parts, so that can't be tested.
        if (OrigFlags.isReturned() &&
            (NumParts * NewVT.getSizeInBits() != CurVT.getSizeInBits())) {
          Flags.setReturned(false);
        }
      }

      Args[i].Flags.push_back(Flags);
      if (Assigner.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i],
                             Args[i].Flags[Part], CCInfo)) {
        // Still couldn't assign this smaller part type for some reason.
        return false;
      }
    }
  }

  return true;
}

bool CallLowering::handleAssignments(ValueHandler &Handler,
                                     SmallVectorImpl<ArgInfo> &Args,
                                     CCState &CCInfo,
                                     SmallVectorImpl<CCValAssign> &ArgLocs,
                                     MachineIRBuilder &MIRBuilder,
                                     Register ThisReturnReg) const {
  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();

  const unsigned NumArgs = Args.size();

  for (unsigned i = 0, j = 0; i != NumArgs; ++i, ++j) {
    assert(j < ArgLocs.size() && "Skipped too many arg locs");
    CCValAssign &VA = ArgLocs[j];
    assert(VA.getValNo() == i && "Location doesn't correspond to current arg");

[ARM] Support lowering of half-precision FP arguments and returns in AArch32's backend
Half-precision floating-point arguments and returns are currently promoted to
either float or int32 in clang's CodeGen, and there is no existing support for
lowering `half` arguments and returns from IR in AArch32's backend. Such
frontend coercions, implemented as coercion through memory in clang, can cause
a series of issues in argument lowering, such as arguments being stored in the
wrong bits on big-endian architectures, and missed overflow detections in the
returns of certain functions. This patch introduces handling of half-precision
arguments and returns in the backend using the actual `half` type on the IR.
Using the `half` type, the backend is able to properly enforce the AAPCS's
directions for those arguments, making sure they are stored in the proper bits
of the registers and performing the necessary floating-point conversions.
Differential Revision: https://reviews.llvm.org/D75169

    if (VA.needsCustom()) {
      unsigned NumArgRegs =
          Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
      if (!NumArgRegs)
        return false;
      j += NumArgRegs;
      continue;
    }

    const MVT ValVT = VA.getValVT();
    const MVT LocVT = VA.getLocVT();

    const LLT LocTy(LocVT);
    const LLT ValTy(ValVT);
    const LLT NewLLT = Handler.isIncomingArgumentHandler() ? LocTy : ValTy;
    const EVT OrigVT = EVT::getEVT(Args[i].Ty);
    const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);

    // Expected to be multiple regs for a single incoming arg.
    // There should be Regs.size() ArgLocs per argument.
    // This should be the same as getNumRegistersForCallingConv.
    const unsigned NumParts = Args[i].Flags.size();

    // Now split the registers into the assigned types.
    Args[i].OrigRegs.assign(Args[i].Regs.begin(), Args[i].Regs.end());

    if (NumParts != 1 || NewLLT != OrigTy) {
      // If we can't directly assign the register, we need one or more
      // intermediate values.
      Args[i].Regs.resize(NumParts);

      // For each split register, create and assign a vreg that will store
      // the incoming component of the larger value. These will later be
      // merged to form the final vreg.
      for (unsigned Part = 0; Part < NumParts; ++Part)
        Args[i].Regs[Part] = MRI.createGenericVirtualRegister(NewLLT);
    }

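    // E.g. an incoming s128 argument assigned as two s64 parts on a
    // hypothetical 64-bit target gets two fresh s64 vregs here; they are
    // merged back into the original s128 vreg after assignment.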
    assert((j + (NumParts - 1)) < ArgLocs.size() &&
           "Too many regs for number of args");

    // Coerce into outgoing value types before register assignment.
    if (!Handler.isIncomingArgumentHandler() && OrigTy != ValTy) {
      assert(Args[i].OrigRegs.size() == 1);
      buildCopyToRegs(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy,
                      ValTy, extendOpFromFlags(Args[i].Flags[0]));
    }

    for (unsigned Part = 0; Part < NumParts; ++Part) {
      Register ArgReg = Args[i].Regs[Part];
      // There should be Regs.size() ArgLocs per argument.
      VA = ArgLocs[j + Part];
      const ISD::ArgFlagsTy Flags = Args[i].Flags[Part];

      if (VA.isMemLoc() && !Flags.isByVal()) {
        // Individual pieces may have been spilled to the stack and others
        // passed in registers.

        // TODO: The memory size may be larger than the value we need to
        // store. We may need to adjust the offset for big endian targets.
        LLT MemTy = Handler.getStackValueStoreType(DL, VA, Flags);
MachinePointerInfo MPO;
|
2021-06-10 17:31:30 -04:00
|
|
|
Register StackAddr = Handler.getStackAddress(
|
|
|
|
MemTy.getSizeInBytes(), VA.getLocMemOffset(), MPO, Flags);
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
|
|
|
|
2021-06-10 17:31:30 -04:00
|
|
|
Handler.assignValueToAddress(Args[i], Part, StackAddr, MemTy, MPO, VA);
|
2020-07-08 09:11:53 -04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-03-05 17:28:32 -05:00
|
|
|
if (VA.isMemLoc() && Flags.isByVal()) {
|
|
|
|
assert(Args[i].Regs.size() == 1 &&
|
|
|
|
"didn't expect split byval pointer");
|
2021-03-12 11:06:18 -05:00
|
|
|
|
|
|
|
if (Handler.isIncomingArgumentHandler()) {
|
|
|
|
// We just need to copy the frame index value to the pointer.
|
|
|
|
MachinePointerInfo MPO;
|
|
|
|
Register StackAddr = Handler.getStackAddress(
|
|
|
|
Flags.getByValSize(), VA.getLocMemOffset(), MPO, Flags);
|
|
|
|
MIRBuilder.buildCopy(Args[i].Regs[0], StackAddr);
|
|
|
|
} else {
|
|
|
|
// For outgoing byval arguments, insert the implicit copy that byval
|
|
|
|
// implies, such that writes in the callee do not modify the caller's
|
|
|
|
// value.
|
|
|
|
uint64_t MemSize = Flags.getByValSize();
|
|
|
|
int64_t Offset = VA.getLocMemOffset();
|
|
|
|
|
|
|
|
MachinePointerInfo DstMPO;
|
|
|
|
Register StackAddr =
|
|
|
|
Handler.getStackAddress(MemSize, Offset, DstMPO, Flags);
|
|
|
|
|
2021-03-14 10:26:31 -04:00
|
|
|
MachinePointerInfo SrcMPO(Args[i].OrigValue);
|
|
|
|
if (!Args[i].OrigValue) {
|
|
|
|
// We still need to accurately track the stack address space if we
|
|
|
|
// don't know the underlying value.
|
|
|
|
const LLT PtrTy = MRI.getType(StackAddr);
|
|
|
|
SrcMPO = MachinePointerInfo(PtrTy.getAddressSpace());
|
|
|
|
}
|
2021-03-12 11:06:18 -05:00
|
|
|
|
|
|
|
Align DstAlign = std::max(Flags.getNonZeroByValAlign(),
|
|
|
|
inferAlignFromPtrInfo(MF, DstMPO));
|
|
|
|
|
2021-03-14 10:26:31 -04:00
|
|
|
Align SrcAlign = std::max(Flags.getNonZeroByValAlign(),
|
|
|
|
inferAlignFromPtrInfo(MF, SrcMPO));
|
2021-03-12 11:06:18 -05:00
|
|
|
|
|
|
|
Handler.copyArgumentMemory(Args[i], StackAddr, Args[i].Regs[0],
|
|
|
|
DstMPO, DstAlign, SrcMPO, SrcAlign,
|
|
|
|
MemSize, VA);
|
|
|
|
}
|
2021-03-05 17:28:32 -05:00
|
|
|
continue;
|
|
|
|
}
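For reference, a sketch of the flag state that routes an argument down this
byval path; the assigner marks the argument byval with its memory size and
alignment before the CCAssignFn assigns it a memory location (the concrete
values here are hypothetical):
ISD::ArgFlagsTy Flags;
Flags.setByVal();              // makes Flags.isByVal() select the path above
Flags.setByValSize(16);        // read back above via Flags.getByValSize()
Flags.setByValAlign(Align(8)); // read back via Flags.getNonZeroByValAlign()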
|
|
|
|
|
|
|
|
assert(!VA.needsCustom() && "custom loc should have been handled already");
|
2020-07-08 09:11:53 -04:00
|
|
|
|
2021-02-03 10:33:48 -08:00
|
|
|
if (i == 0 && ThisReturnReg.isValid() &&
|
|
|
|
Handler.isIncomingArgumentHandler() &&
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
|
|
|
isTypeIsValidForThisReturn(ValVT)) {
|
2021-02-03 10:33:48 -08:00
|
|
|
Handler.assignValueToReg(Args[i].Regs[i], ThisReturnReg, VA);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-02-09 12:09:20 -05:00
|
|
|
Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
|
2020-07-08 09:11:53 -04:00
|
|
|
}
|
|
|
|
|
2021-02-09 12:09:20 -05:00
|
|
|
// Now that all pieces have been assigned, re-pack the register typed values
|
|
|
|
// into the original value typed registers.
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
|
|
|
if (Handler.isIncomingArgumentHandler() && OrigVT != LocVT) {
|
2021-02-07 12:12:31 -05:00
|
|
|
// Merge the split registers into the expected larger result vregs of
|
|
|
|
// the original call.
|
2021-02-09 12:09:20 -05:00
|
|
|
buildCopyFromRegs(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy,
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
|
|
|
LocTy, Args[i].Flags[0]);
|
2016-12-05 10:40:33 +00:00
|
|
|
}
|
2020-07-08 09:11:53 -04:00
|
|
|
|
2021-04-12 21:40:23 -04:00
|
|
|
j += NumParts - 1;
|
2016-12-05 10:40:33 +00:00
|
|
|
}
|
2020-07-08 09:11:53 -04:00
|
|
|
|
2016-12-05 10:40:33 +00:00
|
|
|
return true;
|
2019-09-10 23:25:12 +00:00
|
|
|
}
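A target normally drives the assignment loop above through the
determineAndHandleAssignments entry point declared alongside it. A hedged
sketch; the handler type and CC_MyTarget are the target's own and purely
illustrative here:
MyTargetIncomingValueHandler Handler(MIRBuilder, MRI); // hypothetical handler
IncomingValueAssigner Assigner(CC_MyTarget);           // tablegen CCAssignFn
if (!determineAndHandleAssignments(Handler, Assigner, SplitArgs, MIRBuilder,
                                   F.getCallingConv(), F.isVarArg()))
  return false;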
|
|
|
|
|
2020-12-23 12:22:36 +05:30
|
|
|
void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
|
|
|
|
ArrayRef<Register> VRegs, Register DemoteReg,
|
|
|
|
int FI) const {
|
|
|
|
MachineFunction &MF = MIRBuilder.getMF();
|
|
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
const DataLayout &DL = MF.getDataLayout();
|
|
|
|
|
|
|
|
SmallVector<EVT, 4> SplitVTs;
|
|
|
|
SmallVector<uint64_t, 4> Offsets;
|
|
|
|
ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0);
|
|
|
|
|
|
|
|
assert(VRegs.size() == SplitVTs.size());
|
|
|
|
|
|
|
|
unsigned NumValues = SplitVTs.size();
|
|
|
|
Align BaseAlign = DL.getPrefTypeAlign(RetTy);
|
|
|
|
Type *RetPtrTy = RetTy->getPointerTo(DL.getAllocaAddrSpace());
|
|
|
|
LLT OffsetLLTy = getLLTForType(*DL.getIntPtrType(RetPtrTy), DL);
|
|
|
|
|
|
|
|
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
|
|
|
|
|
|
|
|
for (unsigned I = 0; I < NumValues; ++I) {
|
|
|
|
Register Addr;
|
|
|
|
MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
|
|
|
|
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
|
2021-07-16 12:55:41 -04:00
|
|
|
MRI.getType(VRegs[I]),
|
2020-12-23 12:22:36 +05:30
|
|
|
commonAlignment(BaseAlign, Offsets[I]));
|
|
|
|
MIRBuilder.buildLoad(VRegs[I], Addr, *MMO);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
|
|
|
|
ArrayRef<Register> VRegs,
|
|
|
|
Register DemoteReg) const {
|
|
|
|
MachineFunction &MF = MIRBuilder.getMF();
|
|
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
const DataLayout &DL = MF.getDataLayout();
|
|
|
|
|
|
|
|
SmallVector<EVT, 4> SplitVTs;
|
|
|
|
SmallVector<uint64_t, 4> Offsets;
|
|
|
|
ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0);
|
|
|
|
|
|
|
|
assert(VRegs.size() == SplitVTs.size());
|
|
|
|
|
|
|
|
unsigned NumValues = SplitVTs.size();
|
|
|
|
Align BaseAlign = DL.getPrefTypeAlign(RetTy);
|
|
|
|
unsigned AS = DL.getAllocaAddrSpace();
|
|
|
|
LLT OffsetLLTy =
|
|
|
|
getLLTForType(*DL.getIntPtrType(RetTy->getPointerTo(AS)), DL);
|
|
|
|
|
|
|
|
MachinePointerInfo PtrInfo(AS);
|
|
|
|
|
|
|
|
for (unsigned I = 0; I < NumValues; ++I) {
|
|
|
|
Register Addr;
|
|
|
|
MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
|
|
|
|
auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
|
2021-07-16 12:55:41 -04:00
|
|
|
MRI.getType(VRegs[I]),
|
2020-12-23 12:22:36 +05:30
|
|
|
commonAlignment(BaseAlign, Offsets[I]));
|
|
|
|
MIRBuilder.buildStore(VRegs[I], Addr, *MMO);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void CallLowering::insertSRetIncomingArgument(
|
|
|
|
const Function &F, SmallVectorImpl<ArgInfo> &SplitArgs, Register &DemoteReg,
|
|
|
|
MachineRegisterInfo &MRI, const DataLayout &DL) const {
|
|
|
|
unsigned AS = DL.getAllocaAddrSpace();
|
|
|
|
DemoteReg = MRI.createGenericVirtualRegister(
|
|
|
|
LLT::pointer(AS, DL.getPointerSizeInBits(AS)));
|
|
|
|
|
|
|
|
Type *PtrTy = PointerType::get(F.getReturnType(), AS);
|
|
|
|
|
|
|
|
SmallVector<EVT, 1> ValueVTs;
|
|
|
|
ComputeValueVTs(*TLI, DL, PtrTy, ValueVTs);
|
|
|
|
|
|
|
|
// NOTE: Assume that a pointer won't get split into more than one VT.
|
|
|
|
assert(ValueVTs.size() == 1);
|
|
|
|
|
2021-07-08 11:26:30 -04:00
|
|
|
ArgInfo DemoteArg(DemoteReg, ValueVTs[0].getTypeForEVT(PtrTy->getContext()),
|
|
|
|
ArgInfo::NoArgIndex);
|
2020-12-23 12:22:36 +05:30
|
|
|
setArgFlags(DemoteArg, AttributeList::ReturnIndex, DL, F);
|
|
|
|
DemoteArg.Flags[0].setSRet();
|
|
|
|
SplitArgs.insert(SplitArgs.begin(), DemoteArg);
|
|
|
|
}
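A hedged sketch of the callee side of sret demotion, roughly as it could
appear in a target's CallLowering subclass; F, SplitArgs, ReturnVRegs, and
the surrounding state are assumed:
Register DemoteReg;
insertSRetIncomingArgument(F, SplitArgs, DemoteReg, MRI, DL);
// SplitArgs[0] is now the hidden sret pointer; assign arguments as usual.
// When lowering the return, write the value back through that pointer:
insertSRetStores(MIRBuilder, F.getReturnType(), ReturnVRegs, DemoteReg);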
|
|
|
|
|
|
|
|
void CallLowering::insertSRetOutgoingArgument(MachineIRBuilder &MIRBuilder,
|
|
|
|
const CallBase &CB,
|
|
|
|
CallLoweringInfo &Info) const {
|
|
|
|
const DataLayout &DL = MIRBuilder.getDataLayout();
|
|
|
|
Type *RetTy = CB.getType();
|
|
|
|
unsigned AS = DL.getAllocaAddrSpace();
|
|
|
|
LLT FramePtrTy = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
|
|
|
|
|
|
|
|
int FI = MIRBuilder.getMF().getFrameInfo().CreateStackObject(
|
|
|
|
DL.getTypeAllocSize(RetTy), DL.getPrefTypeAlign(RetTy), false);
|
|
|
|
|
|
|
|
Register DemoteReg = MIRBuilder.buildFrameIndex(FramePtrTy, FI).getReg(0);
|
2021-07-08 11:26:30 -04:00
|
|
|
ArgInfo DemoteArg(DemoteReg, PointerType::get(RetTy, AS),
|
|
|
|
ArgInfo::NoArgIndex);
|
2020-12-23 12:22:36 +05:30
|
|
|
setArgFlags(DemoteArg, AttributeList::ReturnIndex, DL, CB);
|
|
|
|
DemoteArg.Flags[0].setSRet();
|
|
|
|
|
|
|
|
Info.OrigArgs.insert(Info.OrigArgs.begin(), DemoteArg);
|
|
|
|
Info.DemoteStackIndex = FI;
|
|
|
|
Info.DemoteRegister = DemoteReg;
|
|
|
|
}
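The matching caller side, sketched under the same assumptions inside a
target's lowerCall; the helper prepends the hidden sret argument and records
where to reload the value once the call has been emitted:
if (!Info.CanLowerReturn) {
  insertSRetOutgoingArgument(MIRBuilder, CB, Info);
  // ... emit the call using Info.OrigArgs ...
  insertSRetLoads(MIRBuilder, CB.getType(), Info.OrigRet.Regs,
                  Info.DemoteRegister, Info.DemoteStackIndex);
}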
|
|
|
|
|
|
|
|
bool CallLowering::checkReturn(CCState &CCInfo,
|
|
|
|
SmallVectorImpl<BaseArgInfo> &Outs,
|
|
|
|
CCAssignFn *Fn) const {
|
|
|
|
for (unsigned I = 0, E = Outs.size(); I < E; ++I) {
|
|
|
|
MVT VT = MVT::getVT(Outs[I].Ty);
|
|
|
|
if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
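Note the CCAssignFn convention this relies on: the assignment function
returns true when it fails to place a value, so checkReturn succeeds only if
every part finds a location. A typical canLowerReturn override delegates
here; RetCC_MyTarget stands in for the target's tablegen-generated return
CCAssignFn:
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
return checkReturn(CCInfo, Outs, RetCC_MyTarget);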
|
|
|
|
|
|
|
|
void CallLowering::getReturnInfo(CallingConv::ID CallConv, Type *RetTy,
|
|
|
|
AttributeList Attrs,
|
|
|
|
SmallVectorImpl<BaseArgInfo> &Outs,
|
|
|
|
const DataLayout &DL) const {
|
|
|
|
LLVMContext &Context = RetTy->getContext();
|
|
|
|
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
|
|
|
|
|
|
|
|
SmallVector<EVT, 4> SplitVTs;
|
|
|
|
ComputeValueVTs(*TLI, DL, RetTy, SplitVTs);
|
|
|
|
addArgFlagsFromAttributes(Flags, Attrs, AttributeList::ReturnIndex);
|
|
|
|
|
|
|
|
for (EVT VT : SplitVTs) {
|
|
|
|
unsigned NumParts =
|
|
|
|
TLI->getNumRegistersForCallingConv(Context, CallConv, VT);
|
|
|
|
MVT RegVT = TLI->getRegisterTypeForCallingConv(Context, CallConv, VT);
|
|
|
|
Type *PartTy = EVT(RegVT).getTypeForEVT(Context);
|
|
|
|
|
|
|
|
for (unsigned I = 0; I < NumParts; ++I) {
|
|
|
|
Outs.emplace_back(PartTy, Flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CallLowering::checkReturnTypeForCallConv(MachineFunction &MF) const {
|
|
|
|
const auto &F = MF.getFunction();
|
|
|
|
Type *ReturnType = F.getReturnType();
|
|
|
|
CallingConv::ID CallConv = F.getCallingConv();
|
|
|
|
|
|
|
|
SmallVector<BaseArgInfo, 4> SplitArgs;
|
|
|
|
getReturnInfo(CallConv, ReturnType, F.getAttributes(), SplitArgs,
|
|
|
|
MF.getDataLayout());
|
2021-01-06 11:22:23 +05:30
|
|
|
return canLowerReturn(MF, CallConv, SplitArgs, F.isVarArg());
|
2020-12-23 12:22:36 +05:30
|
|
|
}
|
|
|
|
|
2021-01-12 18:58:30 -05:00
|
|
|
bool CallLowering::parametersInCSRMatch(
|
|
|
|
const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask,
|
|
|
|
const SmallVectorImpl<CCValAssign> &OutLocs,
|
|
|
|
const SmallVectorImpl<ArgInfo> &OutArgs) const {
|
|
|
|
for (unsigned i = 0; i < OutLocs.size(); ++i) {
|
|
|
|
auto &ArgLoc = OutLocs[i];
|
|
|
|
// If it's not a register, it's fine.
|
|
|
|
if (!ArgLoc.isRegLoc())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
MCRegister PhysReg = ArgLoc.getLocReg();
|
|
|
|
|
|
|
|
// Only look at callee-saved registers.
|
|
|
|
if (MachineOperand::clobbersPhysReg(CallerPreservedMask, PhysReg))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs()
|
|
|
|
<< "... Call has an argument passed in a callee-saved register.\n");
|
|
|
|
|
|
|
|
// Check if it was copied from.
|
|
|
|
const ArgInfo &OutInfo = OutArgs[i];
|
|
|
|
|
|
|
|
if (OutInfo.Regs.size() > 1) {
|
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs() << "... Cannot handle arguments in multiple registers.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if we copy the register, walking through copies from virtual
|
|
|
|
// registers. Note that getDefIgnoringCopies does not ignore copies from
|
|
|
|
// physical registers.
|
|
|
|
MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
|
|
|
|
if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
|
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs()
|
|
|
|
<< "... Parameter was not copied into a VReg, cannot tail call.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Got a copy. Verify that it's the same as the register we want.
|
|
|
|
Register CopyRHS = RegDef->getOperand(1).getReg();
|
|
|
|
if (CopyRHS != PhysReg) {
|
|
|
|
LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
|
|
|
|
"VReg, cannot tail call.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
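A hedged sketch of how a target's tail-call eligibility check might invoke
this helper; every name besides the helper itself is assumed local state:
const uint32_t *Mask = TRI.getCallPreservedMask(MF, CallerCC);
SmallVector<CCValAssign, 16> OutLocs;
CCState OutInfo(CalleeCC, Info.IsVarArg, MF, OutLocs, F.getContext());
// ... run the outgoing CCAssignFn over OutArgs to fill OutLocs ...
if (!parametersInCSRMatch(MF.getRegInfo(), Mask, OutLocs, OutArgs))
  return false; // an argument is forwarded through a callee-saved register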
|
|
|
|
|
2019-09-10 23:25:12 +00:00
|
|
|
bool CallLowering::resultsCompatible(CallLoweringInfo &Info,
|
|
|
|
MachineFunction &MF,
|
|
|
|
SmallVectorImpl<ArgInfo> &InArgs,
|
2021-05-04 18:12:38 -04:00
|
|
|
ValueAssigner &CalleeAssigner,
|
|
|
|
ValueAssigner &CallerAssigner) const {
|
2019-09-10 23:25:12 +00:00
|
|
|
const Function &F = MF.getFunction();
|
|
|
|
CallingConv::ID CalleeCC = Info.CallConv;
|
|
|
|
CallingConv::ID CallerCC = F.getCallingConv();
|
|
|
|
|
|
|
|
if (CallerCC == CalleeCC)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
SmallVector<CCValAssign, 16> ArgLocs1;
|
2021-05-05 20:25:31 -04:00
|
|
|
CCState CCInfo1(CalleeCC, Info.IsVarArg, MF, ArgLocs1, F.getContext());
|
2021-05-04 18:12:38 -04:00
|
|
|
if (!determineAssignments(CalleeAssigner, InArgs, CCInfo1))
|
2019-09-10 23:25:12 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
SmallVector<CCValAssign, 16> ArgLocs2;
|
2021-05-05 20:25:31 -04:00
|
|
|
CCState CCInfo2(CallerCC, F.isVarArg(), MF, ArgLocs2, F.getContext());
|
2021-05-04 18:12:38 -04:00
|
|
|
if (!determineAssignments(CallerAssigner, InArgs, CCInfo2))
|
2019-09-10 23:25:12 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
// We need the argument locations to match up exactly. If there's more in
|
|
|
|
// one than the other, then we are done.
|
|
|
|
if (ArgLocs1.size() != ArgLocs2.size())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Make sure that each location is passed in exactly the same way.
|
|
|
|
for (unsigned i = 0, e = ArgLocs1.size(); i < e; ++i) {
|
|
|
|
const CCValAssign &Loc1 = ArgLocs1[i];
|
|
|
|
const CCValAssign &Loc2 = ArgLocs2[i];
|
|
|
|
|
|
|
|
// We need both of them to be the same. So if one is a register and one
|
|
|
|
// isn't, we're done.
|
|
|
|
if (Loc1.isRegLoc() != Loc2.isRegLoc())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (Loc1.isRegLoc()) {
|
|
|
|
// If they don't have the same register location, we're done.
|
|
|
|
if (Loc1.getLocReg() != Loc2.getLocReg())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// They matched, so we can move to the next ArgLoc.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Loc1 wasn't a RegLoc, so they both must be MemLocs. Check if they match.
|
|
|
|
if (Loc1.getLocMemOffset() != Loc2.getLocMemOffset())
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2016-12-05 10:40:33 +00:00
|
|
|
}
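The companion check during tail-call analysis can be wired up as below; the
two assigners usually wrap the callee's and caller's CCAssignFns (CC_Callee
and CC_Caller are hypothetical):
IncomingValueAssigner CalleeAssigner(CC_Callee);
IncomingValueAssigner CallerAssigner(CC_Caller);
if (!resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner))
  return false; // the two conventions place results differently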
|
2016-12-13 10:46:12 +00:00
|
|
|
|
2021-06-10 17:31:30 -04:00
|
|
|
LLT CallLowering::ValueHandler::getStackValueStoreType(
|
2021-07-14 14:03:18 -04:00
|
|
|
const DataLayout &DL, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const {
|
2021-06-10 17:31:30 -04:00
|
|
|
const MVT ValVT = VA.getValVT();
|
2021-07-14 14:03:18 -04:00
|
|
|
if (ValVT != MVT::iPTR) {
|
|
|
|
LLT ValTy(ValVT);
|
|
|
|
|
|
|
|
// We lost the pointeriness going through CCValAssign, so try to restore it
|
|
|
|
// based on the flags.
|
|
|
|
if (Flags.isPointer()) {
|
|
|
|
LLT PtrTy = LLT::pointer(Flags.getPointerAddrSpace(),
|
|
|
|
ValTy.getScalarSizeInBits());
|
|
|
|
if (ValVT.isVector())
|
|
|
|
return LLT::vector(ValTy.getElementCount(), PtrTy);
|
|
|
|
return PtrTy;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ValTy;
|
|
|
|
}
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
|
|
|
|
2021-07-14 14:03:18 -04:00
|
|
|
unsigned AddrSpace = Flags.getPointerAddrSpace();
|
|
|
|
return LLT::pointer(AddrSpace, DL.getPointerSize(AddrSpace));
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
|
|
|
}
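A worked example for this hook, with hypothetical values: a pointer argument
reaches us as an integer MVT because CCValAssign cannot carry pointer types,
and the flags restore the pointer:
ISD::ArgFlagsTy Flags;
Flags.setPointer();
Flags.setPointerAddrSpace(0);
// With VA.getValVT() == MVT::i64, getStackValueStoreType() now returns
// LLT::pointer(0, 64) rather than LLT::scalar(64).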
|
|
|
|
|
2021-03-12 11:06:18 -05:00
|
|
|
void CallLowering::ValueHandler::copyArgumentMemory(
|
|
|
|
const ArgInfo &Arg, Register DstPtr, Register SrcPtr,
|
|
|
|
const MachinePointerInfo &DstPtrInfo, Align DstAlign,
|
|
|
|
const MachinePointerInfo &SrcPtrInfo, Align SrcAlign, uint64_t MemSize,
|
|
|
|
CCValAssign &VA) const {
|
|
|
|
MachineFunction &MF = MIRBuilder.getMF();
|
|
|
|
MachineMemOperand *SrcMMO = MF.getMachineMemOperand(
|
|
|
|
SrcPtrInfo,
|
|
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable, MemSize,
|
|
|
|
SrcAlign);
|
|
|
|
|
|
|
|
MachineMemOperand *DstMMO = MF.getMachineMemOperand(
|
|
|
|
DstPtrInfo,
|
|
|
|
MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable,
|
|
|
|
MemSize, DstAlign);
|
|
|
|
|
|
|
|
const LLT PtrTy = MRI.getType(DstPtr);
|
|
|
|
const LLT SizeTy = LLT::scalar(PtrTy.getSizeInBits());
|
|
|
|
|
|
|
|
auto SizeConst = MIRBuilder.buildConstant(SizeTy, MemSize);
|
|
|
|
MIRBuilder.buildMemCpy(DstPtr, SrcPtr, SizeConst, *DstMMO, *SrcMMO);
|
|
|
|
}
|
|
|
|
|
2019-06-24 16:16:12 +00:00
|
|
|
Register CallLowering::ValueHandler::extendRegister(Register ValReg,
|
2020-04-24 12:05:44 -07:00
|
|
|
CCValAssign &VA,
|
|
|
|
unsigned MaxSizeBits) {
|
2016-12-13 10:46:12 +00:00
|
|
|
LLT LocTy{VA.getLocVT()};
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
|
|
|
LLT ValTy{VA.getValVT()};
|
|
|
|
|
2020-04-24 12:05:44 -07:00
|
|
|
if (LocTy.getSizeInBits() == ValTy.getSizeInBits())
|
2019-04-09 21:22:33 +00:00
|
|
|
return ValReg;
|
2020-04-24 12:05:44 -07:00
|
|
|
|
|
|
|
if (LocTy.isScalar() && MaxSizeBits && MaxSizeBits < LocTy.getSizeInBits()) {
|
|
|
|
if (MaxSizeBits <= ValTy.getSizeInBits())
|
|
|
|
return ValReg;
|
|
|
|
LocTy = LLT::scalar(MaxSizeBits);
|
|
|
|
}
|
|
|
|
|
2021-07-14 14:03:18 -04:00
|
|
|
const LLT ValRegTy = MRI.getType(ValReg);
|
|
|
|
if (ValRegTy.isPointer()) {
|
|
|
|
// The x32 ABI wants to zero extend 32-bit pointers to 64-bit registers, so
|
|
|
|
// we have to cast to do the extension.
|
|
|
|
LLT IntPtrTy = LLT::scalar(ValRegTy.getSizeInBits());
|
|
|
|
ValReg = MIRBuilder.buildPtrToInt(IntPtrTy, ValReg).getReg(0);
|
|
|
|
}
|
|
|
|
|
2016-12-13 10:46:12 +00:00
|
|
|
switch (VA.getLocInfo()) {
|
|
|
|
default: break;
|
|
|
|
case CCValAssign::Full:
|
|
|
|
case CCValAssign::BCvt:
|
|
|
|
// FIXME: bitconverting between vector types may or may not be a
|
|
|
|
// nop in big-endian situations.
|
|
|
|
return ValReg;
|
2017-10-09 20:07:43 +00:00
|
|
|
case CCValAssign::AExt: {
|
|
|
|
auto MIB = MIRBuilder.buildAnyExt(LocTy, ValReg);
|
2020-01-23 11:51:35 +00:00
|
|
|
return MIB.getReg(0);
|
2017-10-09 20:07:43 +00:00
|
|
|
}
|
2016-12-13 10:46:12 +00:00
|
|
|
case CCValAssign::SExt: {
|
2019-07-11 14:18:19 +00:00
|
|
|
Register NewReg = MRI.createGenericVirtualRegister(LocTy);
|
2016-12-13 10:46:12 +00:00
|
|
|
MIRBuilder.buildSExt(NewReg, ValReg);
|
|
|
|
return NewReg;
|
|
|
|
}
|
|
|
|
case CCValAssign::ZExt: {
|
2019-07-11 14:18:19 +00:00
|
|
|
Register NewReg = MRI.createGenericVirtualRegister(LocTy);
|
2016-12-13 10:46:12 +00:00
|
|
|
MIRBuilder.buildZExt(NewReg, ValReg);
|
|
|
|
return NewReg;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
llvm_unreachable("unable to extend register");
|
|
|
|
}
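A hedged sketch of how an outgoing handler in a target commonly uses
extendRegister when pinning a value to its physical register; MIB is assumed
to be the call instruction under construction:
void assignValueToReg(Register ValVReg, Register PhysReg,
                      CCValAssign &VA) override {
  Register ExtReg = extendRegister(ValVReg, VA); // AExt/SExt/ZExt as needed
  MIRBuilder.buildCopy(PhysReg, ExtReg);
  MIB.addUse(PhysReg, RegState::Implicit); // keep the reg live across the call
}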
|
2018-12-29 02:02:13 +00:00
|
|
|
|
2021-05-04 18:12:38 -04:00
|
|
|
void CallLowering::ValueAssigner::anchor() {}
|
2021-03-02 17:40:50 -05:00
|
|
|
|
|
|
|
Register CallLowering::IncomingValueHandler::buildExtensionHint(CCValAssign &VA,
|
|
|
|
Register SrcReg,
|
|
|
|
LLT NarrowTy) {
|
|
|
|
switch (VA.getLocInfo()) {
|
|
|
|
case CCValAssign::LocInfo::ZExt: {
|
|
|
|
return MIRBuilder
|
|
|
|
.buildAssertZExt(MRI.cloneVirtualRegister(SrcReg), SrcReg,
|
|
|
|
NarrowTy.getScalarSizeInBits())
|
|
|
|
.getReg(0);
|
|
|
|
}
|
|
|
|
case CCValAssign::LocInfo::SExt: {
|
|
|
|
return MIRBuilder
|
|
|
|
.buildAssertSExt(MRI.cloneVirtualRegister(SrcReg), SrcReg,
|
|
|
|
NarrowTy.getScalarSizeInBits())
|
|
|
|
.getReg(0);
|
|
|
|
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
return SrcReg;
|
|
|
|
}
|
|
|
|
}
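For an incoming zero-extended i8 arriving in a 32-bit register (types
hypothetical, register name AArch64-flavored), the hint expands to MIR along
these lines before the value is truncated back down:
// %copy:_(s32) = COPY $w0
// %hint:_(s32) = G_ASSERT_ZEXT %copy, 8
// %val:_(s8)   = G_TRUNC %hint
Register Hinted = buildExtensionHint(VA, SrcReg, LLT::scalar(8));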
|
|
|
|
|
GlobalISel: Use DAG call lowering infrastructure in a more compatible way
2021-04-13 13:45:35 -04:00
/// Check if we can use a basic COPY instruction between the two types.
///
/// We're currently building on top of the infrastructure using MVT, which
/// loses pointer information in the CCValAssign. We accept copies from
/// physical registers that have been reported as integers if it's to an
/// equivalently sized pointer LLT.
static bool isCopyCompatibleType(LLT SrcTy, LLT DstTy) {
  if (SrcTy == DstTy)
    return true;

  if (SrcTy.getSizeInBits() != DstTy.getSizeInBits())
    return false;

  SrcTy = SrcTy.getScalarType();
  DstTy = DstTy.getScalarType();

  return (SrcTy.isPointer() && DstTy.isScalar()) ||
         (DstTy.isPointer() && SrcTy.isScalar());
}
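// Illustrative behavior (example types chosen for this comment, not taken
// from any caller):
//   isCopyCompatibleType(LLT::scalar(64), LLT::scalar(64))     -> true
//   isCopyCompatibleType(LLT::pointer(0, 64), LLT::scalar(64)) -> true
//   isCopyCompatibleType(LLT::scalar(32), LLT::scalar(64))     -> false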
2021-03-02 17:40:50 -05:00
void CallLowering::IncomingValueHandler::assignValueToReg(Register ValVReg,
                                                          Register PhysReg,
                                                          CCValAssign &VA) {
2021-04-13 13:45:35 -04:00
  const MVT LocVT = VA.getLocVT();
  const LLT LocTy(LocVT);
  const LLT RegTy = MRI.getType(ValVReg);

  if (isCopyCompatibleType(RegTy, LocTy)) {
2021-03-02 17:40:50 -05:00
    MIRBuilder.buildCopy(ValVReg, PhysReg);
    return;
  }

  auto Copy = MIRBuilder.buildCopy(LocTy, PhysReg);
2021-04-13 13:45:35 -04:00
  auto Hint = buildExtensionHint(VA, Copy.getReg(0), RegTy);
2021-03-02 17:40:50 -05:00
  MIRBuilder.buildTrunc(ValVReg, Hint);
}
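// For illustration only (the register and types are assumptions, not from
// an in-tree test): an incoming i8 that the calling convention promotes to
// a 32-bit GPR with zero-extension would lower roughly as:
//   %copy:_(s32) = COPY $w0
//   %hint:_(s32) = G_ASSERT_ZEXT %copy, 8
//   %arg:_(s8) = G_TRUNC %hint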