1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00

Correct load-op-store cycle detection analysis

Add missing cycle dependency checks in load-op-store fusion.

Fixes PR36274.

Reviewers: craig.topper, bogner

Subscribers: hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D43154

llvm-svn: 327172
This commit is contained in:
Nirav Dave 2018-03-09 20:58:07 +00:00
parent f402fe9644
commit 6d554c47b0
2 changed files with 98 additions and 42 deletions

View File

@ -2109,27 +2109,56 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoadNode->getOffset() != StoreNode->getOffset())
return false;
// Check if the chain is produced by the load or is a TokenFactor with
// the load output chain as an operand. Return InputChain by reference.
SDValue Chain = StoreNode->getChain();
if (Chain == Load.getValue(1)) {
InputChain = LoadNode->getChain();
return true;
}
if (Chain.getOpcode() == ISD::TokenFactor) {
// Fusing Load-Op-Store requires predecessors of store must also
// be predecessors of the load. This addition may cause a loop. We
// can check this by doing a search for Load in the new
// dependencies. As this can be expensive, heuristically prune
// this search by visiting the uses and make sure they all have
// smaller node id than the load.
bool FoundLoad = false;
SmallVector<SDValue, 4> ChainOps;
SmallVector<const SDNode *, 4> LoopWorklist;
SmallPtrSet<const SDNode *, 16> Visited;
const unsigned int Max = 1024;
// Visualization of Load-Op-Store fusion:
// -------------------------
// Legend:
// *-lines = Chain operand dependencies.
// |-lines = Normal operand dependencies.
// Dependencies flow down and right. n-suffix references multiple nodes.
//
// C Xn C
// * * *
// * * *
// Xn A-LD Yn TF Yn
// * * \ | * |
// * * \ | * |
// * * \ | => A--LD_OP_ST
// * * \| \
// TF OP \
// * | \ Zn
// * | \
// A-ST Zn
//
// This merge induced dependences from: #1: Xn -> LD, OP, Zn
// #2: Yn -> LD
// #3: ST -> Zn
// Ensure the transform is safe by checking for the dual
// dependencies to make sure we do not induce a loop.
// As LD is a predecessor to both OP and ST we can do this by checking:
// a). if LD is a predecessor to a member of Xn or Yn.
// b). if a Zn is a predecessor to ST.
// However, (b) can only occur through being a chain predecessor to
// ST, which is the same as Zn being a member or predecessor of Xn,
// which is a subset of LD being a predecessor of Xn. So it's
// subsumed by check (a).
SDValue Chain = StoreNode->getChain();
// Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
FoundLoad = true;
ChainOps.push_back(Load.getOperand(0));
} else if (Chain.getOpcode() == ISD::TokenFactor) {
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
@ -2141,31 +2170,25 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
}
if (!FoundLoad)
return false;
// If Loop Worklist is not empty. Check if we would make a loop.
if (!LoopWorklist.empty()) {
const unsigned int Max = 8192;
// if Load is predecessor to potentially loop inducing chain
// dependencies.
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist,
Max, true))
return false;
// Fail conservatively if we ended loop search early.
if (Visited.size() >= Max)
return false;
}
// Worklist is currently Xn. Add Yn to worklist.
for (SDValue Op : StoredVal->ops())
if (Op.getNode() != LoadNode)
LoopWorklist.push_back(Op.getNode());
// Check (a) if Load is a predecessor to Xn + Yn
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
true))
return false;
// Make a new TokenFactor with all the other input chains except
// for the load.
InputChain =
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
return false;
}
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its

View File

@ -0,0 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
; This tests is checking for a case where the x86 load-op-store fusion
; misses a dependence between the fused load and a non-fused operand
; to the load causing a cycle. Here the dependence in question comes
; from the carry in input of the adcl.
@vx = external local_unnamed_addr global <2 x i32>, align 8
define void @pr36274(i32* %somewhere) {
; CHECK-LABEL: pr36274:
; CHECK: # %bb.0:
; CHECK-NEXT: movl vx+4, %eax
; CHECK-NEXT: addl $1, vx
; CHECK-NEXT: adcl $0, %eax
; CHECK-NEXT: movl %eax, vx+4
; CHECK-NEXT: retl
%a0 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 0
%a1 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 1
%x1 = load volatile i32, i32* %a1, align 4
%x0 = load volatile i32, i32* %a0, align 8
%vx0 = insertelement <2 x i32> undef, i32 %x0, i32 0
%vx1 = insertelement <2 x i32> %vx0, i32 %x1, i32 1
%x = bitcast <2 x i32> %vx1 to i64
%add = add i64 %x, 1
%vadd = bitcast i64 %add to <2 x i32>
%vx1_0 = extractelement <2 x i32> %vadd, i32 0
%vx1_1 = extractelement <2 x i32> %vadd, i32 1
store i32 %vx1_0, i32* %a0, align 8
store i32 %vx1_1, i32* %a1, align 4
ret void
}