1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-20 19:42:54 +02:00
llvm-mirror/lib/Target/ARM/ARMSelectionDAGInfo.cpp
Scott Douglass 9091dc1c0e [ARM] Modify codegen for memcpy intrinsic to prefer LDM/STM.
We were previously codegen'ing memcpy as regular load/store operations and
hoping that the register allocator would allocate registers in ascending order
so that we could apply an LDM/STM combine after register allocation. According
to the commit that first introduced this code (r37179), we planned to teach the
register allocator to allocate the registers in ascending order. This never got
implemented, and up to now we've been stuck with very poor codegen.

A much simpler approach for achieving better codegen is to create MEMCPY pseudo
instructions, attach scratch virtual registers to them and then, post register
allocation, expand the MEMCPYs into LDM/STM pairs using the scratch registers.
The register allocator will have picked arbitrary registers which we sort when
expanding the MEMCPY. This approach also avoids the need to repeatedly calculate
offsets which ultimately ought to be eliminated pre-RA in order to decrease
register pressure.

Fixes PR9199 and PR23768.

[This is based on Peter Collingbourne's r238473 which was reverted.]

Differential Revision: http://reviews.llvm.org/D13239

Change-Id: I727543c2e94136e0f80b8e22d5642d7b9ee5b458
Author: Peter Collingbourne <peter@pcc.me.uk>
llvm-svn: 249322
2015-10-05 14:49:54 +00:00

272 lines
9.4 KiB
C++

//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//
#include "ARMTargetMachine.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
SDValue ARMSelectionDAGInfo::
EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
RTLIB::Libcall LC) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
// Only use a specialized AEABI function if the default version of this
// Libcall is an AEABI function.
if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
return SDValue();
// Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
// able to translate memset to memclr and use the value to index the function
// name array.
enum {
AEABI_MEMCPY = 0,
AEABI_MEMMOVE,
AEABI_MEMSET,
AEABI_MEMCLR
} AEABILibcall;
switch (LC) {
case RTLIB::MEMCPY:
AEABILibcall = AEABI_MEMCPY;
break;
case RTLIB::MEMMOVE:
AEABILibcall = AEABI_MEMMOVE;
break;
case RTLIB::MEMSET:
AEABILibcall = AEABI_MEMSET;
if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
if (ConstantSrc->getZExtValue() == 0)
AEABILibcall = AEABI_MEMCLR;
break;
default:
return SDValue();
}
// Choose the most-aligned libcall variant that we can
enum {
ALIGN1 = 0,
ALIGN4,
ALIGN8
} AlignVariant;
if ((Align & 7) == 0)
AlignVariant = ALIGN8;
else if ((Align & 3) == 0)
AlignVariant = ALIGN4;
else
AlignVariant = ALIGN1;
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
Entry.Node = Dst;
Args.push_back(Entry);
if (AEABILibcall == AEABI_MEMCLR) {
Entry.Node = Size;
Args.push_back(Entry);
} else if (AEABILibcall == AEABI_MEMSET) {
// Adjust parameters for memset, EABI uses format (ptr, size, value),
// GNU library uses (ptr, value, size)
// See RTABI section 4.3.4
Entry.Node = Size;
Args.push_back(Entry);
// Extend or truncate the argument to be an i32 value for the call.
if (Src.getValueType().bitsGT(MVT::i32))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
else if (Src.getValueType().bitsLT(MVT::i32))
Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
Entry.Node = Src;
Entry.Ty = Type::getInt32Ty(*DAG.getContext());
Entry.isSExt = false;
Args.push_back(Entry);
} else {
Entry.Node = Src;
Args.push_back(Entry);
Entry.Node = Size;
Args.push_back(Entry);
}
char const *FunctionNames[4][3] = {
{ "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
{ "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
{ "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
{ "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
};
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Chain)
.setCallee(
TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
TLI->getPointerTy(DAG.getDataLayout())),
std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
}
SDValue
ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.
if ((Align & 3) != 0)
return SDValue();
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (!ConstantSize)
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMCPY);
uint64_t SizeVal = ConstantSize->getZExtValue();
if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMCPY);
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
unsigned EmittedNumMemOps = 0;
EVT VT = MVT::i32;
unsigned VTSize = 4;
unsigned i = 0;
// Emit a maximum of 4 loads in Thumb1 since we have fewer registers
const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
SDValue TFOps[6];
SDValue Loads[6];
uint64_t SrcOff = 0, DstOff = 0;
// FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
// VLDM/VSTM and make this code emit it when appropriate. This would reduce
// pressure on the general purpose registers. However this seems harder to map
// onto the register allocator's view of the world.
// The number of MEMCPY pseudo-instructions to emit. We use up to
// MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
// later on. This is a lower bound on the number of MEMCPY operations we must
// emit.
unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
for (unsigned I = 0; I != NumMEMCPYs; ++I) {
// Evenly distribute registers among MEMCPY operations to reduce register
// pressure.
unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
DAG.getConstant(NumRegs, dl, MVT::i32));
Src = Dst.getValue(1);
Chain = Dst.getValue(2);
DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
EmittedNumMemOps = NextEmittedNumMemOps;
}
if (BytesLeft == 0)
return Chain;
// Issue loads / stores for the trailing (1 - 3) bytes.
unsigned BytesLeftSave = BytesLeft;
i = 0;
while (BytesLeft) {
if (BytesLeft >= 2) {
VT = MVT::i16;
VTSize = 2;
} else {
VT = MVT::i8;
VTSize = 1;
}
Loads[i] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
SrcPtrInfo.getWithOffset(SrcOff),
false, false, false, 0);
TFOps[i] = Loads[i].getValue(1);
++i;
SrcOff += VTSize;
BytesLeft -= VTSize;
}
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(TFOps, i));
i = 0;
BytesLeft = BytesLeftSave;
while (BytesLeft) {
if (BytesLeft >= 2) {
VT = MVT::i16;
VTSize = 2;
} else {
VT = MVT::i8;
VTSize = 1;
}
TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
DstPtrInfo.getWithOffset(DstOff), false, false, 0);
++i;
DstOff += VTSize;
BytesLeft -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(TFOps, i));
}
SDValue ARMSelectionDAGInfo::
EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMMOVE);
}
SDValue ARMSelectionDAGInfo::
EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain, SDValue Dst,
SDValue Src, SDValue Size,
unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMSET);
}