
Take into account the cost of local intervals when selecting a split candidate.

When selecting a split candidate for region splitting, the register allocator tries to predict which candidate will have the cheapest spill cost.
Global splitting may cause the creation of local intervals, and they might spill.

This patch makes RA take into account the spill cost of local split intervals in use blocks (we already take into account the spill cost in through blocks).
A flag ("-consider-local-interval-cost") controls whether we do this advanced cost calculation (it is on by default for the X86 target, off for the rest).
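
For reference, such a flag is normally declared as a cl::opt in lib/CodeGen/RegAllocGreedy.cpp. The declaration below is only an illustrative sketch: the flag string comes from this message, while the variable name, description text, and the per-target default wiring are assumptions (the latter is handled elsewhere in the patch and is not shown in the hunks below).

// Illustrative sketch, not the verbatim patch contents.
static cl::opt<bool> ConsiderLocalIntervalCost(
    "consider-local-interval-cost", cl::Hidden,
    cl::desc("Consider the cost of local intervals created by a split "
             "candidate when choosing the best split candidate."),
    cl::init(false));

With such a declaration the behavior can be forced on for any target, e.g. llc -regalloc=greedy -consider-local-interval-cost=1 foo.ll.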

Differential Revision: https://reviews.llvm.org/D41585

Change-Id: Icccb8ad2dbf13124f5d97a18c67d95aa6be0d14d
llvm-svn: 323870
Marina Yatsina 2018-01-31 13:31:08 +00:00
parent b3f8010487
commit 13c308caeb
6 changed files with 313 additions and 125 deletions


@@ -107,6 +107,13 @@ public:
/// with the highest enum value is returned.
InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg);
/// Check for interference in the segment [Start, End) that may prevent
/// assignment to PhysReg. If this function returns true, there is
/// interference in the segment [Start, End) of some other interval already
/// assigned to PhysReg. If this function returns false, PhysReg is free at
/// the segment [Start, End).
bool checkInterference(SlotIndex Start, SlotIndex End, unsigned PhysReg);
/// Assign VirtReg to PhysReg.
/// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
/// update VirtRegMap. The live range is expected to be available in PhysReg.


@@ -205,3 +205,19 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
return IK_Free;
}
bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
unsigned PhysReg) {
// Construct artificial live range containing only one segment [Start, End).
VNInfo valno(0, Start);
LiveRange::Segment Seg(Start, End, &valno);
LiveRange LR;
LR.addSegment(Seg);
// Check for interference with that segment
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
if (query(LR, *Units).checkInterference())
return true;
}
return false;
}
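
A hypothetical usage sketch of the new overload (the helper name and parameters are made up; it simply mirrors the probing loop that splitCanCauseLocalSpill performs in RegAllocGreedy.cpp below):

// Return the first physical register in the allocation order whose register
// units are all free over the gap [Start, End), or 0 if every candidate
// already has an assigned interval overlapping that gap.
static unsigned findFreeRegForGap(LiveRegMatrix &Matrix,
                                  const AllocationOrder &Order,
                                  SlotIndex Start, SlotIndex End) {
  for (unsigned PhysReg : Order.getOrder())
    if (!Matrix.checkInterference(Start, End, PhysReg))
      return PhysReg;
  return 0;
}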


@@ -448,6 +448,9 @@ private:
bool splitCanCauseEvictionChain(unsigned Evictee, GlobalSplitCandidate &Cand,
unsigned BBNumber,
const AllocationOrder &Order);
bool splitCanCauseLocalSpill(unsigned VirtRegToSplit,
GlobalSplitCandidate &Cand, unsigned BBNumber,
const AllocationOrder &Order);
BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &,
const AllocationOrder &Order,
bool *CanCauseEvictionChain);
@@ -1427,7 +1430,7 @@ BlockFrequency RAGreedy::calcSpillCost() {
/// we are splitting for and the interferences.
/// \param BBNumber The number of a BB for which the region split process will
/// create a local split interval.
/// \param Order The physical registers that may get evicted by a split
/// artifact of Evictee.
/// \return True if splitting Evictee may cause a bad eviction chain, false
/// otherwise.
@@ -1448,8 +1451,8 @@ bool RAGreedy::splitCanCauseEvictionChain(unsigned Evictee,
getCheapestEvicteeWeight(Order, LIS->getInterval(Evictee),
Cand.Intf.first(), Cand.Intf.last(), &MaxWeight);
// The bad eviction chain occurs when either the split candidate is the
// evicting reg or one of the split artifacts will evict the evicting reg.
if ((PhysReg != Cand.PhysReg) && (PhysReg != FutureEvictedPhysReg))
return false;
@@ -1479,6 +1482,54 @@ bool RAGreedy::splitCanCauseEvictionChain(unsigned Evictee,
return true;
}
/// \brief Check if splitting VirtRegToSplit will create a local split interval
/// in basic block number BBNumber that may cause a spill.
///
/// \param VirtRegToSplit The register considered to be split.
/// \param Cand The split candidate that determines the physical
/// register we are splitting for and the interferences.
/// \param BBNumber The number of a BB for which the region split process
/// will create a local split interval.
/// \param Order The physical registers that may get evicted by a
/// split artifact of VirtRegToSplit.
/// \return True if splitting VirtRegToSplit may cause a spill, false
/// otherwise.
bool RAGreedy::splitCanCauseLocalSpill(unsigned VirtRegToSplit,
GlobalSplitCandidate &Cand,
unsigned BBNumber,
const AllocationOrder &Order) {
Cand.Intf.moveToBlock(BBNumber);
// Check if the local interval will find a non-interfering assignment.
for (auto PhysReg : Order.getOrder()) {
if (!Matrix->checkInterference(Cand.Intf.first().getPrevIndex(),
Cand.Intf.last(), PhysReg))
return false;
}
// Check if the local interval will evict a cheaper interval.
float CheapestEvictWeight = 0;
unsigned FutureEvictedPhysReg = getCheapestEvicteeWeight(
Order, LIS->getInterval(VirtRegToSplit), Cand.Intf.first(),
Cand.Intf.last(), &CheapestEvictWeight);
// Have we found an interval that can be evicted?
if (FutureEvictedPhysReg) {
VirtRegAuxInfo VRAI(*MF, *LIS, VRM, getAnalysis<MachineLoopInfo>(), *MBFI);
float splitArtifactWeight =
VRAI.futureWeight(LIS->getInterval(VirtRegToSplit),
Cand.Intf.first().getPrevIndex(), Cand.Intf.last());
// Will the weight of the local interval be higher than the cheapest evictee
// weight? If so, it will evict that interval and will not cause a spill.
if (splitArtifactWeight >= 0 && splitArtifactWeight > CheapestEvictWeight)
return false;
}
// The local interval is not able to find a non-interfering assignment and is
// not able to evict a less worthy interval; therefore, it can cause a spill.
return true;
}
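
To make the decision above easier to follow, here is a toy model of it in plain C++ (made-up types and weights, not LLVM code): the local split interval spills only if it can neither be placed on a free register nor out-weigh the cheapest interval it could evict.

#include <cstdio>
#include <vector>

struct RegState {
  bool FreeOverGap;      // no assigned interval overlaps the local gap
  float CheapestEvictee; // weight of the cheapest blocking interval, <0 if none
};

static bool localSplitMaySpill(const std::vector<RegState> &Order,
                               float LocalWeight) {
  float Cheapest = -1.0f;
  for (const RegState &R : Order) {
    if (R.FreeOverGap)
      return false;                 // case 1: a free register exists, no spill
    if (R.CheapestEvictee >= 0 &&
        (Cheapest < 0 || R.CheapestEvictee < Cheapest))
      Cheapest = R.CheapestEvictee;
  }
  if (Cheapest >= 0 && LocalWeight > Cheapest)
    return false;                   // case 2: heavier than some evictee, evict instead
  return true;                      // case 3: cannot place, cannot evict, may spill
}

int main() {
  std::vector<RegState> Order = {{false, 2.0f}, {false, 3.5f}};
  std::printf("%d\n", localSplitMaySpill(Order, 4.0f)); // 0: evicts the 2.0 interval
  std::printf("%d\n", localSplitMaySpill(Order, 1.0f)); // 1: too light, may spill
  return 0;
}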
/// calcGlobalSplitCost - Return the global split cost of following the split
/// pattern in LiveBundles. This cost should be added to the local cost of the
/// interference pattern in SplitConstraints.
@@ -1499,19 +1550,26 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
Cand.Intf.moveToBlock(BC.Number);
// Check whether a local interval is going to be created during the region
// split.
if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
Cand.Intf.hasInterference() && BI.LiveIn && BI.LiveOut && RegIn &&
RegOut) {
// split. Calculate advanced split cost (cost of local intervals) if the option
// is enabled.
if (EnableAdvancedRASplitCost && Cand.Intf.hasInterference() && BI.LiveIn &&
BI.LiveOut && RegIn && RegOut) {
if (splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
// This interference causes our eviction from this assignment; we might
// evict somebody else, add that cost.
if (CanCauseEvictionChain &&
splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
// This interference causes our eviction from this assignment; we might
// evict somebody else and eventually someone will spill, so add that cost.
// See splitCanCauseEvictionChain for detailed description of scenarios.
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
*CanCauseEvictionChain = true;
} else if (splitCanCauseLocalSpill(VirtRegToSplit, Cand, BC.Number,
Order)) {
// This interference causes the local interval to spill; add that cost.
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
}
}
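
In effect, for every use block where the chosen candidate would leave behind a local interval that either triggers a bad eviction chain or spills, the block frequency is added to that candidate's global cost twice (that is what the two GlobalCost additions above do; reading the doubling as an approximation of the spill store plus reload is an assumption, not something stated in the patch). With illustrative numbers: if candidate A would create spilling local intervals in blocks of frequency 120 and 30 while candidate B would not, the advanced check adds

  2 * 120 + 2 * 30 = 300

units of block frequency to A's cost only, which can flip the final comparison toward B even though A looked cheaper under the old, purely static calculation.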
@@ -1540,7 +1598,7 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
// region split.
if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
splitCanCauseEvictionChain(VirtRegToSplit, Cand, Number, Order)) {
// This interference causes our eviction from this assignment; we might
// evict somebody else, add that cost.
// See splitCanCauseEvictionChain for detailed description of
// scenarios.


@@ -22,9 +22,10 @@
; CHECK: bb.2.for.body:
; CHECK: SUBPDrr
; CHECK-NEXT: MOVAPSmr
; CHECK-NEXT: MOVAPSrm
; CHECK-NEXT: MULPDrm
; CHECK-NEXT: MOVAPSrm
; CHECK-NEXT: ADDPDrr
; CHECK-NEXT: MOVAPSmr
; CHECK-NEXT: ADD32ri8
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"


@@ -0,0 +1,89 @@
; RUN: llc < %s -march=x86 -regalloc=greedy --debug-only=regalloc 2>&1 | FileCheck %s
; This test is meant to make sure that the weight of local intervals that are
; created during split is taken into account when choosing the best candidate
; register.
; %shl is the interval that will be split.
; The inline assembly calls interfere with %shl and leave only two available
; split candidates: %esi and %ebp.
; The old code would have chosen %esi as the split candidate ignoring the fact
; that this choice will cause the creation of a local interval that will have a
; certain spill cost.
; The new code chooses %ebp as the split candidate as it has the lower spill cost.
; Make sure the split behaves as expected:
; CHECK: RS_Split Cascade 1
; CHECK-NOT: %eax static =
; CHECK: %eax no positive bundles
; CHECK-NEXT: %ecx no positive bundles
; CHECK-NEXT: %edx no positive bundles
; CHECK-NEXT: %esi static =
; CHECK-NEXT: %edi no positive bundles
; CHECK-NEXT: %ebx no positive bundles
; CHECK-NEXT: %ebp static =
; CHECK: Split for %ebp
; Function Attrs: nounwind
define i32 @foo(i32* %array, i32 %cond1, i32 %val) local_unnamed_addr #0 {
entry:
%array.addr = alloca i32*, align 4
store i32* %array, i32** %array.addr, align 4, !tbaa !3
%0 = load i32, i32* %array, align 4, !tbaa !7
%arrayidx1 = getelementptr inbounds i32, i32* %array, i32 1
%1 = load i32, i32* %arrayidx1, align 4, !tbaa !7
%arrayidx2 = getelementptr inbounds i32, i32* %array, i32 2
%2 = load i32, i32* %arrayidx2, align 4, !tbaa !7
%arrayidx3 = getelementptr inbounds i32, i32* %array, i32 3
%3 = load i32, i32* %arrayidx3, align 4, !tbaa !7
%arrayidx4 = getelementptr inbounds i32, i32* %array, i32 4
%4 = load i32, i32* %arrayidx4, align 4, !tbaa !7
%arrayidx6 = getelementptr inbounds i32, i32* %array, i32 %val
%5 = load i32, i32* %arrayidx6, align 4, !tbaa !7
%shl = shl i32 %5, 5
%tobool = icmp eq i32 %cond1, 0
br i1 %tobool, label %if.else, label %if.then
if.then: ; preds = %entry
%arrayidx7 = getelementptr inbounds i32, i32* %array, i32 6
store i32 %shl, i32* %arrayidx7, align 4, !tbaa !7
call void asm "nop", "=*m,r,r,r,r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32** nonnull %array.addr, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32** nonnull %array.addr) #1, !srcloc !9
%6 = load i32*, i32** %array.addr, align 4, !tbaa !3
%arrayidx8 = getelementptr inbounds i32, i32* %6, i32 7
br label %if.end
if.else: ; preds = %entry
%arrayidx5 = getelementptr inbounds i32, i32* %array, i32 5
%7 = load i32, i32* %arrayidx5, align 4, !tbaa !7
%arrayidx9 = getelementptr inbounds i32, i32* %array, i32 8
store i32 %shl, i32* %arrayidx9, align 4, !tbaa !7
call void asm "nop", "=*m,{ax},{bx},{cx},{dx},{di},{si},{ebp},*m,~{dirflag},~{fpsr},~{flags}"(i32** nonnull %array.addr, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %7, i32* undef, i32** nonnull %array.addr) #1, !srcloc !10
%8 = load i32*, i32** %array.addr, align 4, !tbaa !3
%arrayidx10 = getelementptr inbounds i32, i32* %8, i32 9
br label %if.end
if.end: ; preds = %if.else, %if.then
%arrayidx10.sink = phi i32* [ %arrayidx10, %if.else ], [ %arrayidx8, %if.then ]
%9 = phi i32* [ %8, %if.else ], [ %6, %if.then ]
store i32 %shl, i32* %arrayidx10.sink, align 4, !tbaa !7
%10 = load i32, i32* %9, align 4, !tbaa !7
%add = add nsw i32 %10, %shl
ret i32 %add
}
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"NumRegisterParameters", i32 0}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 6.0.0"}
!3 = !{!4, !4, i64 0}
!4 = !{!"any pointer", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
!7 = !{!8, !8, i64 0}
!8 = !{!"int", !5, i64 0}
!9 = !{i32 268}
!10 = !{i32 390}


@@ -148,21 +148,21 @@ define i32 @sad_32i8() nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1040(%rax), %xmm8
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
@@ -216,61 +216,65 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm7
; SSE2-NEXT: pxor %xmm6, %xmm7
; SSE2-NEXT: paddd %xmm7, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm6
; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm10, %xmm6
; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm1, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm14
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm8
; SSE2-NEXT: pxor %xmm0, %xmm8
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm0
; SSE2-NEXT: paddd %xmm8, %xmm14
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm15, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm6, %xmm3
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm0
; SSE2-NEXT: paddd %xmm14, %xmm13
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm3, %xmm4
; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm13, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
; SSE2-NEXT: paddd %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -400,43 +404,43 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: subq $200, %rsp
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movaps a+1040(%rax), %xmm0
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
@@ -497,7 +501,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: psubd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm7, %xmm0
@@ -506,7 +510,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; SSE2-NEXT: psubd %xmm3, %xmm9
; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
@@ -539,7 +543,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
; SSE2-NEXT: psubd %xmm13, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, %xmm13
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
@@ -554,15 +558,13 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm4
@@ -570,7 +572,6 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm8
@@ -578,7 +579,6 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm11, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm11
@@ -587,55 +587,64 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm11
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm15
; SSE2-NEXT: pxor %xmm1, %xmm15
; SSE2-NEXT: paddd %xmm15, %xmm2
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm15
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm10
; SSE2-NEXT: pxor %xmm1, %xmm10
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm6
; SSE2-NEXT: pxor %xmm1, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm12, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm12
; SSE2-NEXT: pxor %xmm1, %xmm12
; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm12, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm13
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: pxor %xmm0, %xmm9
; SSE2-NEXT: paddd %xmm9, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm9, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
@@ -643,32 +652,40 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm7
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm3, %xmm8
; SSE2-NEXT: paddd %xmm2, %xmm15
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm8, %xmm13
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: paddd %xmm11, %xmm10
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm10, %xmm1
; SSE2-NEXT: paddd %xmm13, %xmm1
; SSE2-NEXT: paddd %xmm15, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]