mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 02:52:53 +02:00
[AArch64] Add new subtarget feature to fold LSL into address mode.
Summary: This feature enables folding of logical shift operations of up to 3 places into the addressing mode on Kryo and Falkor, which have a fastpath LSL. Reviewers: mcrosier, rengolin, t.p.northover Subscribers: junbuml, gberry, llvm-commits, aemerson Differential Revision: https://reviews.llvm.org/D31113 llvm-svn: 299240
This commit is contained in:
parent
cf9e1adbff
commit
127d0e73d2
@ -126,6 +126,9 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
|
||||
"equivalent when the immediate does "
|
||||
"not fit in the encoding.">;
|
||||
|
||||
// FeatureLSLFast: set on CPUs (this patch enables it for Kryo and Falkor)
// whose fastpath handles a logical shift left of up to 3 places, making it
// profitable for ISel to fold such shifts into the addressing mode.
def FeatureLSLFast : SubtargetFeature<
    "lsl-fast", "HasLSLFast", "true",
    "CPU has a fastpath logical shift of up to 3 places">;
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Architectures.
|
||||
//
|
||||
@ -279,7 +282,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
|
||||
FeaturePerfMon,
|
||||
FeaturePostRAScheduler,
|
||||
FeaturePredictableSelectIsExpensive,
|
||||
FeatureZCZeroing
|
||||
FeatureZCZeroing,
|
||||
FeatureLSLFast
|
||||
]>;
|
||||
|
||||
def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
|
||||
@ -293,7 +297,8 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
|
||||
FeaturePostRAScheduler,
|
||||
FeaturePredictableSelectIsExpensive,
|
||||
FeatureRDM,
|
||||
FeatureZCZeroing
|
||||
FeatureZCZeroing,
|
||||
FeatureLSLFast
|
||||
]>;
|
||||
|
||||
def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
|
||||
|
@ -328,11 +328,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Determine whether it is worth it to fold SHL into the addressing
/// mode.
/// Profitable only when the shift amount is a constant in [0, 3] (the range
/// an AArch64 addressing mode can encode) and the shifted value is consumed
/// by memory operations, so the shift computation would otherwise be folded
/// away entirely.
static bool isWorthFoldingSHL(SDValue V) {
  assert(V.getOpcode() == ISD::SHL && "invalid opcode");
  // It is worth folding logical shift of up to three places.
  auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!CSD)
    return false; // Variable shift amounts cannot go into the address mode.
  unsigned ShiftVal = CSD->getZExtValue();
  if (ShiftVal > 3)
    return false; // Address modes only encode an LSL of up to 3 places.

  // Check if this particular node is reused in any non-memory related
  // operation.  If yes, do not try to fold this node into the address
  // computation, since the computation will be kept.
  // NOTE(review): the check is one level deep -- a non-memory user is
  // tolerated as long as all of ITS users are memory nodes; confirm this
  // asymmetry is intentional before tightening it.
  const SDNode *Node = V.getNode();
  for (SDNode *UI : Node->uses())
    if (!isa<MemSDNode>(*UI))
      for (SDNode *UII : UI->uses())
        if (!isa<MemSDNode>(*UII))
          return false;
  return true;
}
|
||||
|
||||
/// \brief Determine whether it is worth to fold V into an extended register.
|
||||
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
|
||||
// it hurts if the value is used at least twice, unless we are optimizing
|
||||
// for code size.
|
||||
return ForCodeSize || V.hasOneUse();
|
||||
// Trivial if we are optimizing for code size or if there is only
|
||||
// one use of the value.
|
||||
if (ForCodeSize || V.hasOneUse())
|
||||
return true;
|
||||
// If a subtarget has a fastpath LSL we can fold a logical shift into
|
||||
// the addressing mode and save a cycle.
|
||||
if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
|
||||
isWorthFoldingSHL(V))
|
||||
return true;
|
||||
if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
|
||||
const SDValue LHS = V.getOperand(0);
|
||||
const SDValue RHS = V.getOperand(1);
|
||||
if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
|
||||
return true;
|
||||
if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
|
||||
return true;
|
||||
}
|
||||
|
||||
// It hurts otherwise, since the value will be reused.
|
||||
return false;
|
||||
}
|
||||
|
||||
/// SelectShiftedRegister - Select a "shifted register" operand. If the value
|
||||
|
@ -69,6 +69,7 @@ protected:
|
||||
bool HasPerfMon = false;
|
||||
bool HasFullFP16 = false;
|
||||
bool HasSPE = false;
|
||||
bool HasLSLFast = false;
|
||||
|
||||
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
|
||||
bool HasZeroCycleRegMove = false;
|
||||
@ -232,6 +233,7 @@ public:
|
||||
bool hasPerfMon() const { return HasPerfMon; }
|
||||
bool hasFullFP16() const { return HasFullFP16; }
|
||||
bool hasSPE() const { return HasSPE; }
|
||||
bool hasLSLFast() const { return HasLSLFast; }
|
||||
|
||||
bool isLittleEndian() const { return IsLittle; }
|
||||
|
||||
|
74
test/CodeGen/AArch64/aarch64-fold-lslfast.ll
Normal file
74
test/CodeGen/AArch64/aarch64-fold-lslfast.ll
Normal file
@ -0,0 +1,74 @@
|
||||
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s
|
||||
|
||||
%struct.a = type [256 x i16]
|
||||
%struct.b = type [256 x i32]
|
||||
%struct.c = type [256 x i64]
|
||||
|
||||
declare void @foo()
|
||||
; i16 load/store indexed by (xor72 >> 9) & 255: the index's LSL #1 must be
; folded into the ldrh/strh addressing mode (ubfx extracts the index once).
define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: halfword:
; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldrh [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #1]
; CHECK: strh [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #1]
  %shr81 = lshr i32 %xor72, 9
  %conv82 = zext i32 %shr81 to i64
  %idxprom83 = and i64 %conv82, 255
  %arrayidx86 = getelementptr inbounds %struct.a, %struct.a* %ctx, i64 0, i64 %idxprom83
  %result = load i16, i16* %arrayidx86, align 2
  call void @foo()
  store i16 %result, i16* %arrayidx86, align 2
  ret i16 %result
}
|
||||
|
||||
; Same pattern as @halfword for i32 elements: the index's LSL #2 must be
; folded into the ldr/str addressing mode.
define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: word:
; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldr [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #2]
; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #2]
  %shr81 = lshr i32 %xor72, 9
  %conv82 = zext i32 %shr81 to i64
  %idxprom83 = and i64 %conv82, 255
  %arrayidx86 = getelementptr inbounds %struct.b, %struct.b* %ctx, i64 0, i64 %idxprom83
  %result = load i32, i32* %arrayidx86, align 4
  call void @foo()
  store i32 %result, i32* %arrayidx86, align 4
  ret i32 %result
}
|
||||
|
||||
; Same pattern as @halfword for i64 elements: the index's LSL #3 -- the
; maximum shift the feature allows -- must be folded into the addressing mode.
define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: doubleword:
; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldr [[REG1:x[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #3]
; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #3]
  %shr81 = lshr i32 %xor72, 9
  %conv82 = zext i32 %shr81 to i64
  %idxprom83 = and i64 %conv82, 255
  %arrayidx86 = getelementptr inbounds %struct.c, %struct.c* %ctx, i64 0, i64 %idxprom83
  %result = load i64, i64* %arrayidx86, align 8
  call void @foo()
  store i64 %result, i64* %arrayidx86, align 8
  ret i64 %result
}
|
||||
|
||||
; Negative test: the shl results feed compares (non-memory uses whose users
; are also non-memory), so per isWorthFoldingSHL they must NOT be folded as
; shifted operands; each lsl is materialized once and the plain values reused.
define i64 @multi_use_non_memory(i64 %a, i64 %b) {
; CHECK-LABEL: multi_use_non_memory:
; CHECK: lsl [[REG1:x[0-9]+]], x0, #3
; CHECK-NOT: cmp [[REG1]], x1, lsl # 3
; CHECK-NEXT: lsl [[REG2:x[0-9]+]], x1, #3
; CHECK-NEXT: cmp [[REG1]], [[REG2]]
entry:
  %mul1 = shl i64 %a, 3
  %mul2 = shl i64 %b, 3
  %cmp = icmp slt i64 %mul1, %mul2
  br i1 %cmp, label %truebb, label %falsebb
truebb:
  tail call void @foo()
  unreachable
falsebb:
  %cmp2 = icmp sgt i64 %mul1, %mul2
  br i1 %cmp2, label %exitbb, label %endbb
exitbb:
  ret i64 %mul1
endbb:
  ret i64 %mul2
}
|
Loading…
Reference in New Issue
Block a user