
[SDAG] Make the DAGCombine worklist not grow endlessly due to duplicate insertions.

The old behavior could cause arbitrarily bad memory usage in the DAG
combiner when there was heavy traffic of re-adding nodes already on the
worklist. This commit switches the DAG combine worklist to work the same
way as the instcombine worklist: we null out removed entries and only add
new entries to the worklist. My measurements of codegen time show a
slight improvement. The memory utilization is, unsurprisingly, dominated
by other factors (the IR and the DAG itself, I suspect).
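
The shape of the new scheme is small enough to sketch outside of LLVM. What
follows is a minimal, hypothetical sketch rather than the patch itself: it
assumes standard containers in place of LLVM's SmallVector/DenseMap and a
placeholder Node type standing in for SDNode. The real change is in the
DAGCombiner.cpp hunks further down.

// Sketch only (not LLVM code): a deduplicating worklist with null tombstones.
#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

struct Node {};

class Worklist {
  std::vector<Node *> Order;                   // visit order; may contain nulls
  std::unordered_map<Node *, std::size_t> Map; // node -> its index in Order

public:
  // Adding is a no-op when the node is already present, so duplicate
  // insertions can no longer grow the worklist.
  void add(Node *N) {
    if (Map.insert({N, Order.size()}).second)
      Order.push_back(N);
  }

  // Removal nulls out the slot rather than erasing it, which keeps the
  // recorded indices stable and avoids a linear shift of the vector.
  void remove(Node *N) {
    auto It = Map.find(N);
    if (It == Map.end())
      return;
    Order[It->second] = nullptr;
    Map.erase(It);
  }

  // Popping skips the null tombstones left behind by remove().
  Node *pop() {
    while (!Order.empty()) {
      Node *N = Order.back();
      Order.pop_back();
      if (N) {
        Map.erase(N);
        return N;
      }
    }
    return nullptr;
  }

  bool empty() const { return Map.empty(); }
};

int main() {
  Node A, B;
  Worklist WL;
  WL.add(&A);
  WL.add(&A); // duplicate: ignored, so the worklist cannot grow endlessly
  WL.add(&B);
  WL.remove(&A); // leaves a null tombstone behind in Order
  assert(WL.pop() == &B);
  assert(WL.pop() == nullptr && WL.empty());
}

The map is what bounds the memory here: the vector holds each live node at
most once, plus tombstones that are reclaimed as they are popped past.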

This change results in subtle, frustrating churn in the particular order
in which DAG combines are applied, which causes a number of minor
regressions where we fail to match a pattern previously matched by
accident. AFAICT, all of these should be using AddToWorklist directly or
should be written in a less brittle way. None of the changes seem
drastically bad, and a few of the changes seem distinctly better.

A major change required to make this work is to significantly harden the
way in which the DAG combiner handles nodes which become dead (zero
uses). Previously, we relied on the ability to "priority-bump" them on
the combine worklist to achieve recursive deletion of these nodes and to
ensure that the frontier of remaining live nodes was all added to the
worklist. Instead, I've introduced a routine that implements precisely
that logic with no indirection. It is a significantly simpler operation
than the combiner worklist proper. I suspect this will also fix some
other problems with the combiner.
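
For illustration, here is a similarly hedged sketch of that deletion logic,
again with placeholder types rather than SelectionDAG APIs; the committed
routine, recursivelyDeleteUnusedNodes, appears in the diff below and uses a
SmallSetVector where this sketch tolerates duplicate visits.

// Sketch only (not LLVM code): recursive deletion of dead nodes, pushing the
// surviving frontier back onto the worklist.
#include <iostream>
#include <vector>

struct Node {
  const char *Name = "";
  std::vector<Node *> Operands;
  int NumUses = 0;
};

// Trivial stand-ins for the combiner's worklist and DAG-deletion hooks.
static void addToWorklist(Node *N) { std::cout << "revisit " << N->Name << '\n'; }
static void removeFromWorklist(Node *) {}
static void deleteNode(Node *N) { std::cout << "delete " << N->Name << '\n'; }

// Returns true if N was dead and has been deleted, along with any operands for
// which it was the only user. Operands that remain live form the frontier and
// go back on the worklist, since losing a user may enable new combines.
static bool recursivelyDeleteUnusedNodes(Node *N) {
  if (N->NumUses != 0)
    return false;
  std::vector<Node *> Pending{N};
  while (!Pending.empty()) {
    Node *Cur = Pending.back();
    Pending.pop_back();
    if (Cur->NumUses == 0) {
      for (Node *Op : Cur->Operands) {
        --Op->NumUses; // deleting Cur releases one use of each operand
        Pending.push_back(Op);
      }
      removeFromWorklist(Cur);
      deleteNode(Cur);
    } else {
      addToWorklist(Cur);
    }
  }
  return true;
}

int main() {
  Node A{"A"}, B{"B"}, C{"C"};
  B.Operands = {&A}; A.NumUses = 2; // A has another user elsewhere
  C.Operands = {&B}; B.NumUses = 1; // C is B's only user
  recursivelyDeleteUnusedNodes(&C); // deletes C and B; A survives, is revisited
}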

I think the x86 changes are really minor and uninteresting, but the
avx512 change at least is hiding a "regression" (despite the test case
being just noise, not testing some performance invariant) that might be
worth looking into. I'm not sure whether any of the others impact specific "important"
code paths, but they didn't look terribly interesting to me, or the
changes were really minor. The consensus in review was to fix any
regressions that show up after the fact.

Thanks to the other reviewers for checking the output on other
architectures. There is a specific regression on ARM that Tim already
has a fix prepped to commit.

Differential Revision: http://reviews.llvm.org/D4616

llvm-svn: 213727
Chandler Carruth 2014-07-23 07:08:53 +00:00
parent 08dae2c274
commit 908e62868c
16 changed files with 93 additions and 273 deletions

View File

@@ -18,6 +18,7 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -87,25 +88,21 @@ namespace {
bool LegalTypes;
bool ForCodeSize;
// Worklist of all of the nodes that need to be simplified.
//
// This has the semantics that when adding to the worklist,
// the item added must be next to be processed. It should
// also only appear once. The naive approach to this takes
// linear time.
//
// To reduce the insert/remove time to logarithmic, we use
// a set and a vector to maintain our worklist.
//
// The set contains the items on the worklist, but does not
// maintain the order they should be visited.
//
// The vector maintains the order nodes should be visited, but may
// contain duplicate or removed nodes. When choosing a node to
// visit, we pop off the order stack until we find an item that is
// also in the contents set. All operations are O(log N).
SmallPtrSet<SDNode*, 64> WorklistContents;
SmallVector<SDNode*, 64> WorklistOrder;
/// \brief Worklist of all of the nodes that need to be simplified.
///
/// This must behave as a stack -- new nodes to process are pushed onto the
/// back and when processing we pop off of the back.
///
/// The worklist will not contain duplicates but may contain null entries
/// due to nodes being deleted from the underlying DAG.
SmallVector<SDNode *, 64> Worklist;
/// \brief Mapping from an SDNode to its position on the worklist.
///
/// This is used to find and remove nodes from the worklist (by nulling
/// them) when they are deleted from the underlying DAG. It relies on
/// stable indices of nodes within the worklist.
DenseMap<SDNode *, unsigned> WorklistMap;
// AA - Used for DAG load/store alias analysis.
AliasAnalysis &AA;
@@ -132,16 +129,24 @@ namespace {
if (N->getOpcode() == ISD::HANDLENODE)
return;
WorklistContents.insert(N);
WorklistOrder.push_back(N);
if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
Worklist.push_back(N);
}
/// removeFromWorklist - remove all instances of N from the worklist.
///
void removeFromWorklist(SDNode *N) {
WorklistContents.erase(N);
auto It = WorklistMap.find(N);
if (It == WorklistMap.end())
return; // Not in the worklist.
// Null out the entry rather than erasing it to avoid a linear operation.
Worklist[It->second] = nullptr;
WorklistMap.erase(It);
}
bool recursivelyDeleteUnusedNodes(SDNode *N);
SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
bool AddTo = true);
@@ -1072,6 +1077,35 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
return false;
}
/// \brief Recursively delete a node which has no uses and any operands for
/// which it is the only use.
///
/// Note that this both deletes the nodes and removes them from the worklist.
/// It also adds any nodes that have had a user deleted to the worklist, as
/// they may now have only one use and be subject to other combines.
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
if (!N->use_empty())
return false;
SmallSetVector<SDNode *, 16> Nodes;
Nodes.insert(N);
do {
N = Nodes.pop_back_val();
if (!N)
continue;
if (N->use_empty()) {
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
Nodes.insert(N->getOperand(i).getNode());
removeFromWorklist(N);
DAG.DeleteNode(N);
} else {
AddToWorklist(N);
}
} while (!Nodes.empty());
return true;
}
//===----------------------------------------------------------------------===//
// Main DAG Combiner implementation
@@ -1099,27 +1133,25 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
// while the worklist isn't empty, find a node and
// try and combine it.
while (!WorklistContents.empty()) {
while (!WorklistMap.empty()) {
SDNode *N;
// The WorklistOrder holds the SDNodes in order, but it may contain
// duplicates.
// In order to avoid a linear scan, we use a set (O(log N)) to hold what the
// worklist *should* contain, and check that the node we want to visit
// should actually be visited.
// The Worklist holds the SDNodes in order, but it may contain null entries.
do {
N = WorklistOrder.pop_back_val();
} while (!WorklistContents.erase(N));
N = Worklist.pop_back_val();
} while (!N);
bool GoodWorklistEntry = WorklistMap.erase(N);
(void)GoodWorklistEntry;
assert(GoodWorklistEntry &&
"Found a worklist entry without a corresponding map entry!");
// If N has no uses, it is dead. Make sure to revisit all N's operands once
// N is deleted from the DAG, since they too may now be dead or may have a
// reduced number of uses, allowing other xforms.
if (N->use_empty()) {
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
AddToWorklist(N->getOperand(i).getNode());
DAG.DeleteNode(N);
if (recursivelyDeleteUnusedNodes(N))
continue;
}
WorklistRemover DeadNodes(*this);
SDValue RV = combine(N);
@@ -1147,7 +1179,6 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
// Transfer debug value.
DAG.TransferDbgValues(SDValue(N, 0), RV);
WorklistRemover DeadNodes(*this);
if (N->getNumValues() == RV.getNode()->getNumValues())
DAG.ReplaceAllUsesWith(N, RV.getNode());
else {
@@ -1161,23 +1192,11 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
AddToWorklist(RV.getNode());
AddUsersToWorklist(RV.getNode());
// Add any uses of the old node to the worklist in case this node is the
// last one that uses them. They may become dead after this node is
// deleted.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
AddToWorklist(N->getOperand(i).getNode());
// Finally, if the node is now dead, remove it from the graph. The node
// may not be dead if the replacement process recursively simplified to
// something else needing this node.
if (N->use_empty()) {
// Nodes can be reintroduced into the worklist. Make sure we do not
// process a node that has been replaced.
removeFromWorklist(N);
// Finally, since the node is now dead, remove it from the graph.
DAG.DeleteNode(N);
}
// something else needing this node. This will also take care of adding any
// operands which have lost a user to the worklist.
recursivelyDeleteUnusedNodes(N);
}
// If the root changed (e.g. it was a dead load, update the root).

View File

@@ -167,9 +167,9 @@ end:
define void @test_varsize(...) minsize {
; CHECK-T1-LABEL: test_varsize:
; CHECK-T1: sub sp, #16
; CHECK-T1: push {r2, r3, r4, r5, r7, lr}
; CHECK-T1: push {r5, r6, r7, lr}
; ...
; CHECK-T1: pop {r2, r3, r4, r5, r7}
; CHECK-T1: pop {r2, r3, r7}
; CHECK-T1: pop {r3}
; CHECK-T1: add sp, #16
; CHECK-T1: bx r3

View File

@@ -9,7 +9,8 @@ define i32 @test0(i8 %A) {
define signext i8 @test1(i32 %A) {
; CHECK: test1
; CHECK: sxtb r0, r0, ror #8
; CHECK: lsr r0, r0, #8
; CHECK: sxtb r0, r0
%B = lshr i32 %A, 8
%C = shl i32 %A, 24
%D = or i32 %B, %C

View File

@@ -24,10 +24,10 @@ entry:
}
; CHECK-LABEL: foo:
; CHECK: lfd 3
; CHECK: lfd 4
; CHECK: lfd 1
; CHECK: lfd 2
; CHECK: lfd 3
; CHECK: lfd 4
define { float, float } @oof() nounwind {
entry:

View File

@@ -35,7 +35,7 @@ if.then9.i39: ; preds = %if.end7.i37
br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
; CHECK: .LBB0_7:
; CHECK: beq 1, .LBB0_10
; CHECK: bne 1, .LBB0_10
; CHECK: beq 0, .LBB0_10
; CHECK: .LBB0_9:

View File

@@ -3,9 +3,9 @@
;CHECK: EXPORT T{{[0-9]}}.XYZW
;CHECK: EXPORT T{{[0-9]}}.0000
;CHECK: EXPORT T{{[0-9]}}.0000
;CHECK: EXPORT T{{[0-9]}}.0XZW
;CHECK: EXPORT T{{[0-9]}}.0XYZ
;CHECK: EXPORT T{{[0-9]}}.XYZW
;CHECK: EXPORT T{{[0-9]}}.YX00
;CHECK: EXPORT T{{[0-9]}}.YZ00
;CHECK: EXPORT T{{[0-9]}}.0000
;CHECK: EXPORT T{{[0-9]}}.0000

View File

@@ -94,7 +94,7 @@ main_body:
; EG-CHECK: @main2
; EG-CHECK: T{{[0-9]+}}.XY__
; EG-CHECK: T{{[0-9]+}}.YXZ0
; EG-CHECK: T{{[0-9]+}}.ZXY0
define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:

View File

@@ -10,7 +10,8 @@ define i32 @test0(i8 %A) {
define signext i8 @test1(i32 %A) {
; CHECK: test1
; CHECK: sxtb.w r0, r0, ror #8
; CHECK: lsrs r0, r0, #8
; CHECK: sxtb r0, r0
%B = lshr i32 %A, 8
%C = shl i32 %A, 24
%D = or i32 %B, %C

View File

@@ -25,7 +25,7 @@ define zeroext i32 @test2(i32 %A.u, i32 %B.u) {
define zeroext i32 @test3(i32 %A.u) {
; A8: test3
; A8: uxth.w r0, r0, ror #8
; A8: ubfx r0, r0, #8, #16
%B.u = lshr i32 %A.u, 8
%C.u = shl i32 %A.u, 24
%D.u = or i32 %B.u, %C.u

View File

@@ -1,14 +0,0 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
define <8 x i16> @test_zext_load() {
; CHECK: vmovq
entry:
%0 = load <2 x i16> ** undef, align 8
%1 = getelementptr inbounds <2 x i16>* %0, i64 1
%2 = load <2 x i16>* %0, align 1
%3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%4 = load <2 x i16>* %1, align 1
%5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %6
}

View File

@@ -237,44 +237,6 @@ exit:
ret i32 %base
}
define void @test_loop_rotate_reversed_blocks() {
; This test case (greatly reduced from an Olden benchmark) ensures that the loop
; rotate implementation doesn't assume that loops are laid out in a particular
; order. The first loop will get split into two basic blocks, with the loop
; header coming after the loop latch.
;
; CHECK: test_loop_rotate_reversed_blocks
; CHECK: %entry
; Look for a jump into the middle of the loop, and no branches mid-way.
; CHECK: jmp
; CHECK: %loop1
; CHECK-NOT: j{{\w*}} .LBB{{.*}}
; CHECK: %loop1
; CHECK: je
entry:
%cond1 = load volatile i1* undef
br i1 %cond1, label %loop2.preheader, label %loop1
loop1:
call i32 @f()
%cond2 = load volatile i1* undef
br i1 %cond2, label %loop2.preheader, label %loop1
loop2.preheader:
call i32 @f()
%cond3 = load volatile i1* undef
br i1 %cond3, label %exit, label %loop2
loop2:
call i32 @f()
%cond4 = load volatile i1* undef
br i1 %cond4, label %exit, label %loop2
exit:
ret void
}
define i32 @test_loop_align(i32 %i, i32* %a) {
; Check that we provide basic loop body alignment with the block placement
; pass.

View File

@@ -31,6 +31,7 @@ entry:
; CHECK-LABEL: test3:
; CHECK: movzbl 8(%esp), %eax
; CHECK-NEXT: imull $171, %eax
; CHECK-NEXT: andl $65024, %eax
; CHECK-NEXT: shrl $9, %eax
; CHECK-NEXT: ret
}
@@ -56,9 +57,10 @@ entry:
%div = sdiv i16 %x, 10
ret i16 %div
; CHECK-LABEL: test6:
; CHECK: imull $26215, %eax, %ecx
; CHECK: sarl $18, %ecx
; CHECK: shrl $15, %eax
; CHECK: imull $26215, %eax
; CHECK: movl %eax, %ecx
; CHECK: shrl $31, %ecx
; CHECK: sarl $18, %eax
}
define i32 @test7(i32 %x) nounwind {

View File

@@ -1,117 +0,0 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s
; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
; i386 test has been disabled when scheduler 2-addr hack is disabled.
; This testcase shouldn't need to spill the -1 value,
; so it should just use pcmpeqd to materialize an all-ones vector.
; For i386, cp load of -1 are folded.
; With -regalloc=greedy, the live range is split before spilling, so the first
; pcmpeq doesn't get folded as a constant pool load.
; I386-NOT: pcmpeqd
; I386: orps LCPI0_2, %xmm
; I386-NOT: pcmpeqd
; I386: orps LCPI0_2, %xmm
; X86-64: pcmpeqd
; X86-64-NOT: pcmpeqd
%struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
%struct._cl_image_format_t = type <{ i32, i32, i32 }>
%struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>
define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
entry:
%tmp3.i = load i32* null ; <i32> [#uses=1]
%cmp = icmp sgt i32 %tmp3.i, 200 ; <i1> [#uses=1]
br i1 %cmp, label %forcond, label %ifthen
ifthen: ; preds = %entry
ret void
forcond: ; preds = %entry
%tmp3.i536 = load i32* null ; <i32> [#uses=1]
%cmp12 = icmp slt i32 0, %tmp3.i536 ; <i1> [#uses=1]
br i1 %cmp12, label %forbody, label %afterfor
forbody: ; preds = %forcond
%bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
%mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1]
%mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1]
%mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1]
%tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
%bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
%bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
%tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind ; <<4 x i32>> [#uses=1]
%tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1]
%sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2]
%mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1]
%add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1]
%mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1]
%add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
%bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1]
%andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1]
%bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1]
%mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1]
%bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1]
%andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1]
%xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
%orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1]
%bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1]
%cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1]
%tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
%sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1]
%bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
%andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0]
%mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1]
%add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
%bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1]
%andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1]
%bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1]
%mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1]
%bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
%bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1]
%andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1]
%bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1]
%xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
%orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1]
%bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1]
%mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1]
%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2]
%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1]
%tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
%bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1]
%andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1]
%orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer ; <<4 x i32>> [#uses=1]
%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1]
%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1]
%bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2]
%andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1]
%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1]
%not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1]
%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1]
%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1]
%bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1]
%andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1]
%orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1]
%bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1]
call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
unreachable
afterfor: ; preds = %forcond
ret void
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

View File

@@ -30,40 +30,6 @@ while.end: ; preds = %while.cond
ret void
}
; DAGCombiner shouldn't fold the sdiv (ashr) away.
; rdar://8636812
; CHECK-LABEL: test2:
; CHECK: sarl
define i32 @test2() nounwind {
entry:
%i = alloca i32, align 4
%j = alloca i8, align 1
store i32 127, i32* %i, align 4
store i8 0, i8* %j, align 1
%tmp3 = load i32* %i, align 4
%mul = mul nsw i32 %tmp3, 2
%conv4 = trunc i32 %mul to i8
%conv5 = sext i8 %conv4 to i32
%div6 = sdiv i32 %conv5, 2
%conv7 = trunc i32 %div6 to i8
%conv9 = sext i8 %conv7 to i32
%cmp = icmp eq i32 %conv9, -1
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
ret i32 0
if.end: ; preds = %entry
call void @abort() noreturn
unreachable
}
declare void @abort() noreturn
declare void @exit(i32) noreturn
; DAG Combiner can't fold this into a load of the 1'th byte.
; PR8757
define i32 @test3(i32 *%P) nounwind ssp {

View File

@@ -34,8 +34,8 @@ entry:
; X64: movb %sil, 1(%rdi)
; X32-LABEL: test2:
; X32: movb 8(%esp), %[[REG:[abcd]l]]
; X32: movb %[[REG]], 1(%{{.*}})
; X32: movzbl 8(%esp), %e[[REG:[abcd]]]x
; X32: movb %[[REG]]l, 1(%{{.*}})
}
define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {

View File

@@ -4,8 +4,8 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind {
; CHECK-LABEL: @t1
; CHECK: movl 4(%esp), %[[R0:e[abcd]x]]
; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]]
; CHECK-NEXT: movl 12(%[[R1]]), %[[R2:e[abcd]x]]
; CHECK-NEXT: movl %[[R2]], (%[[R0]])
; CHECK-NEXT: movss 12(%[[R1]]), %[[R2:xmm.*]]
; CHECK-NEXT: movss %[[R2]], (%[[R0]])
; CHECK-NEXT: retl
%X = load <4 x float>* %P1