[SimplifyIndVar] Avoid generating truncate instructions with non-hoisted Load operand.
Differential Revision: https://reviews.llvm.org/D49151

llvm-svn: 341726
This commit is contained in:
parent 1ea0f12848
commit 58be3eed37
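
At the source level, the pattern this change targets looks roughly like the sketch below (a hypothetical C++ example mirroring @foo4 in the test diff; the struct and names are illustrative, not taken from the commit). When the load of img->channel cannot be hoisted out of the loop, e.g. because alias analysis cannot prove that accesses through `in` never touch it, SCEV sees a loop-variant multiply, and before this patch the widened induction variable was truncated back to 32 bits to feed that multiply.

// Hypothetical C++ counterpart of @foo4 below; illustrative only.
struct Image {
  int channel; // stays loaded inside the loop when it cannot be hoisted
  int stride;  // loaded once outside the loop when hoisting succeeds
};

int sum(const Image *img, const int *in, int length) {
  int total = 0;
  for (int i = 1; i < length; ++i) {
    // 32-bit multiply of a (possibly non-hoisted) load with the IV; the
    // product is sign-extended to 64 bits to form the address of in[...].
    total += in[img->channel * i] + in[img->stride * i];
  }
  return total;
}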
@@ -1017,6 +1017,8 @@ protected:
  Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
  bool widenLoopCompare(NarrowIVDefUse DU);
  bool widenWithVariantLoadUse(NarrowIVDefUse DU);
  void widenWithVariantLoadUseCodegen(NarrowIVDefUse DU);
  void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
};
@@ -1361,6 +1363,146 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
  return true;
}

/// If the narrow use is an instruction whose two operands are the defining
/// instruction of DU and a load instruction, then we have the following:
/// if the load is hoisted outside the loop, then we do not reach this
/// function, as scalar evolution analysis works fine in widenIVUse with
/// variables hoisted outside the loop, and efficient code is subsequently
/// generated without emitting truncate instructions. But when the load is
/// not hoisted (whether due to a limitation in alias analysis or due to a
/// true legality constraint), scalar evolution cannot proceed with
/// loop-variant values and inefficient code is generated. This function
/// handles the non-hoisted load special case by making the optimization
/// generate the same kind of code for hoisted and non-hoisted loads (widen
/// the use and eliminate the sign/zero extend instruction). This special
/// case is important especially when the induction variables affect the
/// addressing mode in code generation.
bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
  Instruction *NarrowUse = DU.NarrowUse;
  Instruction *NarrowDef = DU.NarrowDef;
  Instruction *WideDef = DU.WideDef;

  // Handle the common case of add<nsw/nuw>.
  const unsigned OpCode = NarrowUse->getOpcode();
  // Only Add/Sub/Mul instructions are supported.
  if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
      OpCode != Instruction::Mul)
    return false;

  // The operand that is not defined by NarrowDef of DU. Let's call it the
  // other operand.
  unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == NarrowDef ? 1 : 0;
  assert(DU.NarrowUse->getOperand(1 - ExtendOperIdx) == DU.NarrowDef &&
         "bad DU");

  const SCEV *ExtendOperExpr = nullptr;
  const OverflowingBinaryOperator *OBO =
      cast<OverflowingBinaryOperator>(NarrowUse);
  ExtendKind ExtKind = getExtendKind(NarrowDef);
  if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
    ExtendOperExpr = SE->getSignExtendExpr(
        SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
  else if (ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
    ExtendOperExpr = SE->getZeroExtendExpr(
        SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
  else
    return false;

  // We are interested in the other operand being a load instruction.
  // But we should look into relaxing this restriction later on.
  auto *I = dyn_cast<Instruction>(NarrowUse->getOperand(ExtendOperIdx));
  if (I && I->getOpcode() != Instruction::Load)
    return false;

  // Verify that the defining operand is an AddRec.
  const SCEV *Op1 = SE->getSCEV(WideDef);
  const SCEVAddRecExpr *AddRecOp1 = dyn_cast<SCEVAddRecExpr>(Op1);
  if (!AddRecOp1 || AddRecOp1->getLoop() != L)
    return false;

  // Verify that the other operand is an extend.
  if (ExtKind == SignExtended) {
    if (!isa<SCEVSignExtendExpr>(ExtendOperExpr))
      return false;
  } else {
    if (!isa<SCEVZeroExtendExpr>(ExtendOperExpr))
      return false;
  }

  // Reject the transformation unless every user of the narrow use is an
  // extend to the wide type; otherwise both a narrow and a wide version of
  // the operation would have to be kept alive.
  if (ExtKind == SignExtended) {
    for (Use &U : NarrowUse->uses()) {
      SExtInst *User = dyn_cast<SExtInst>(U.getUser());
      if (!User || User->getType() != WideType)
        return false;
    }
  } else { // ExtKind == ZeroExtended
    for (Use &U : NarrowUse->uses()) {
      ZExtInst *User = dyn_cast<ZExtInst>(U.getUser());
      if (!User || User->getType() != WideType)
        return false;
    }
  }

  return true;
}
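
The final pair of loops is the subtle part: the rewrite is only profitable when every user of the narrow operation is an extend to the wide type, since otherwise both a narrow and a wide version of the operation would remain live (this is exactly what @foo5 in the test diff exercises). A minimal standalone sketch of that check for the sign-extended case; `allUsersAreSExtTo` is an illustrative name, not an LLVM API, and the zero-extended case is identical with ZExtInst:

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative helper, not part of LLVM: true iff every user of V is a
// sign-extend instruction producing exactly WideTy.
static bool allUsersAreSExtTo(Value *V, Type *WideTy) {
  for (User *U : V->users()) {
    auto *Ext = dyn_cast<SExtInst>(U);
    if (!Ext || Ext->getType() != WideTy)
      return false;
  }
  return true;
}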

/// Special case for widening with variant loads (see
/// WidenIV::widenWithVariantLoadUse). This is the code generation part.
void WidenIV::widenWithVariantLoadUseCodegen(NarrowIVDefUse DU) {
  Instruction *NarrowUse = DU.NarrowUse;
  Instruction *NarrowDef = DU.NarrowDef;
  Instruction *WideDef = DU.WideDef;

  ExtendKind ExtKind = getExtendKind(NarrowDef);

  LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");

  // Generate the widened use instruction: the IV operand is replaced by
  // WideDef, and the other operand is extended to the wide type.
  Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
                   ? WideDef
                   : createExtendInst(NarrowUse->getOperand(0), WideType,
                                      ExtKind, NarrowUse);
  Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
                   ? WideDef
                   : createExtendInst(NarrowUse->getOperand(1), WideType,
                                      ExtKind, NarrowUse);

  auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
                                        NarrowBO->getName());
  IRBuilder<> Builder(NarrowUse);
  Builder.Insert(WideBO);
  WideBO->copyIRFlags(NarrowBO);

  // Record the extend kind of the widened use; ExtKind is guaranteed to be
  // either SignExtended or ZeroExtended by widenWithVariantLoadUse.
  ExtendKindMap[NarrowUse] = ExtKind;

  // Update the uses: every extend of the narrow use to the wide type is
  // replaced by the widened operation and scheduled for removal.
  if (ExtKind == SignExtended) {
    for (Use &U : NarrowUse->uses()) {
      SExtInst *User = dyn_cast<SExtInst>(U.getUser());
      if (User && User->getType() == WideType) {
        LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User
                          << " replaced by " << *WideBO << "\n");
        ++NumElimExt;
        User->replaceAllUsesWith(WideBO);
        DeadInsts.emplace_back(User);
      }
    }
  } else { // ExtKind == ZeroExtended
    for (Use &U : NarrowUse->uses()) {
      ZExtInst *User = dyn_cast<ZExtInst>(U.getUser());
      if (User && User->getType() == WideType) {
        LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User
                          << " replaced by " << *WideBO << "\n");
        ++NumElimExt;
        User->replaceAllUsesWith(WideBO);
        DeadInsts.emplace_back(User);
      }
    }
  }
}
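
createExtendInst, used above for the non-IV operand, is defined earlier in this file and is outside this diff. A rough standalone stand-in, assuming an IRBuilder-based implementation (simplified; the real helper also handles constant operands and insertion-point details):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative stand-in for createExtendInst: extend NarrowOper to WideTy
// immediately before the use, matching the IV's extend kind.
static Value *extendOperand(Value *NarrowOper, Type *WideTy, bool IsSigned,
                            Instruction *InsertPt) {
  IRBuilder<> Builder(InsertPt);
  return IsSigned ? Builder.CreateSExt(NarrowOper, WideTy)
                  : Builder.CreateZExt(NarrowOper, WideTy);
}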

/// Determine whether an individual user of the narrow IV can be widened. If
/// so, return the wide clone of the user.
Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {

@@ -1458,6 +1600,16 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
  if (widenLoopCompare(DU))
    return nullptr;

  // We are about to generate a truncate instruction that may hurt
  // performance because the scalar evolution expression computed earlier in
  // WideAddRec.first does not indicate a polynomial induction expression.
  // In that case, look at the operands of the use instruction to determine
  // if we can still widen the use instead of truncating its operand.
  if (widenWithVariantLoadUse(DU)) {
    widenWithVariantLoadUseCodegen(DU);
    return nullptr;
  }

  // This user does not evaluate to a recurrence after widening, so don't
  // follow it. Instead insert a Trunc to kill off the original use,
  // eventually isolating the original narrow IV so it can be removed.
@@ -273,3 +273,87 @@ for.end: ; preds = %for.cond.for.end_cr
  %call = call i32 @dummy(i32* getelementptr inbounds ([100 x i32], [100 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([100 x i32], [100 x i32]* @b, i32 0, i32 0))
  ret i32 0
}

%struct.image = type {i32, i32}

define i32 @foo4(%struct.image* %input, i32 %length, i32* %in) {
entry:
  %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1
  %0 = load i32, i32* %stride, align 4
  %cmp17 = icmp sgt i32 %length, 1
  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  %1 = phi i32 [ %6, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ]
  ret i32 %2

; The mul instructions below are widened instead of having truncate
; instructions generated for them, regardless of whether the load operand of
; the mul is inside or outside the loop (both cases appear here).
; CHECK: for.body:
; CHECK-NOT: trunc
for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ]
  %add = add nuw nsw i32 %x.018, 1
  %3 = load i32, i32* %channel, align 8
  %mul = mul nsw i32 %3, %add
  %idx.ext = sext i32 %mul to i64
  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext
  %4 = load i32, i32* %add.ptr, align 4
  %mul1 = mul nsw i32 %0, %add
  %idx.ext1 = sext i32 %mul1 to i64
  %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1
  %5 = load i32, i32* %add.ptr1, align 4
  %6 = add i32 %4, %5
  %cmp = icmp slt i32 %add, %length
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

define i32 @foo5(%struct.image* %input, i32 %length, i32* %in) {
entry:
  %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1
  %0 = load i32, i32* %stride, align 4
  %cmp17 = icmp sgt i32 %length, 1
  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  %1 = phi i32 [ %7, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ]
  ret i32 %2

; This example is the same as above, except that the first mul is used in
; two places; widening it would result in two versions of the multiply, an
; i32 and an i64 one. In this case, keep the truncate instructions to avoid
; this redundancy.
; CHECK: for.body:
; CHECK: trunc
for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ]
  %add = add nuw nsw i32 %x.018, 1
  %3 = load i32, i32* %channel, align 8
  %mul = mul nsw i32 %3, %add
  %idx.ext = sext i32 %mul to i64
  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext
  %4 = load i32, i32* %add.ptr, align 4
  %mul1 = mul nsw i32 %0, %add
  %idx.ext1 = sext i32 %mul1 to i64
  %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1
  %5 = load i32, i32* %add.ptr1, align 4
  %6 = add i32 %4, %5
  %7 = add i32 %6, %mul
  %cmp = icmp slt i32 %add, %length
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}
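
For contrast, a hypothetical C++ counterpart of @foo5 (illustrative only, reusing the Image struct from the earlier sketch): the 32-bit product also feeds the running total directly, so not every user of the multiply is a sext, widenWithVariantLoadUse rejects it, and the truncate is kept.

// Hypothetical C++ counterpart of @foo5; illustrative only.
int sum2(const Image *img, const int *in, int length) {
  int total = 0;
  for (int i = 1; i < length; ++i) {
    int idx = img->channel * i; // the multiply now has a second, i32 use
    total += in[idx] + in[img->stride * i] + idx;
  }
  return total;
}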