1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 19:12:56 +02:00

[MachineSink] sink more profitable loads

Reviewed By: qcolombet

Differential Revision: https://reviews.llvm.org/D86864
This commit is contained in:
Chen Zheng 2020-11-01 20:55:05 -05:00
parent 5f9db1f559
commit 39e1336a34
8 changed files with 296 additions and 235 deletions

View File

@ -127,6 +127,12 @@ namespace {
/// current block.
DenseSet<DebugVariable> SeenDbgVars;
std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>, bool>
HasStoreCache;
std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>,
std::vector<MachineInstr *>>
StoreInstrCache;
public:
static char ID; // Pass identification
@ -159,6 +165,9 @@ namespace {
MachineBasicBlock *From,
MachineBasicBlock *To);
bool hasStoreBetween(MachineBasicBlock *From, MachineBasicBlock *To,
MachineInstr &MI);
/// Postpone the splitting of the given critical
/// edge (\p From, \p To).
///
@ -359,6 +368,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
EverMadeChange = true;
}
HasStoreCache.clear();
StoreInstrCache.clear();
// Now clear any kill flags for recorded registers.
for (auto I : RegsToClearKillFlags)
MRI->clearKillFlags(I);
@ -919,6 +931,73 @@ static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
}
}
/// hasStoreBetween - check if there is a store between straight line blocks
/// From and To.
///
/// Returns true (conservatively blocking the sink) when the blocks are not in
/// a straight line, when a call / ordered memory operation is found on the
/// path, or when some store on the path may alias \p MI. Results are cached
/// per (From, To) pair in HasStoreCache / StoreInstrCache so subsequent loads
/// between the same block pair do not re-walk the CFG.
bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
                                     MachineBasicBlock *To, MachineInstr &MI) {
  // Make sure From and To are in straight line which means From dominates To
  // and To post dominates From.
  if (!DT->dominates(From, To) || !PDT->dominates(To, From))
    return true;

  auto BlockPair = std::make_pair(From, To);

  // Has this block pair been queried before with a definite cached result?
  // Keep the find() iterator so we don't pay for a second lookup via
  // operator[].
  auto HasStoreIt = HasStoreCache.find(BlockPair);
  if (HasStoreIt != HasStoreCache.end())
    return HasStoreIt->second;

  // If the stores on this path were cached by an earlier query, only the
  // aliasing check against this particular MI needs to be redone.
  auto StoreIt = StoreInstrCache.find(BlockPair);
  if (StoreIt != StoreInstrCache.end())
    return std::any_of(
        StoreIt->second.begin(), StoreIt->second.end(),
        [&](MachineInstr *I) { return I->mayAlias(AA, MI, false); });

  bool SawStore = false;
  bool HasAliasedStore = false;
  DenseSet<MachineBasicBlock *> HandledBlocks;
  // Go through all reachable blocks from From.
  for (MachineBasicBlock *BB : depth_first(From)) {
    // We insert the instruction at the start of block To, so no need to worry
    // about stores inside To.
    // Stores in block From have already been considered on entry to
    // SinkInstruction.
    if (BB == To || BB == From)
      continue;

    // Skip a BB already handled in a previous iteration; insert() reports
    // whether the element is new, so a separate count() lookup is not needed.
    if (!HandledBlocks.insert(BB).second)
      continue;

    // To post dominates BB, so BB must be on a path from block From.
    if (PDT->dominates(To, BB)) {
      for (MachineInstr &I : *BB) {
        // Treat as alias conservatively for a call or an ordered memory
        // operation.
        if (I.isCall() || I.hasOrderedMemoryRef()) {
          HasStoreCache[BlockPair] = true;
          return true;
        }

        if (I.mayStore()) {
          SawStore = true;
          // We still have a chance to sink MI if all stores between are not
          // aliased to MI.
          // Cache all store instructions, so that we don't need to go through
          // all From-reachable blocks for the next load instruction.
          if (I.mayAlias(AA, MI, false))
            HasAliasedStore = true;
          StoreInstrCache[BlockPair].push_back(&I);
        }
      }
    }
  }
  // If there is no store at all, cache the result.
  if (!SawStore)
    HasStoreCache[BlockPair] = false;
  return HasAliasedStore;
}
/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
@ -979,8 +1058,9 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
bool TryBreak = false;
bool store = true;
if (!MI.isSafeToMove(AA, store)) {
bool Store =
MI.mayLoad() ? hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true;
if (!MI.isSafeToMove(AA, Store)) {
LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
TryBreak = true;
}

View File

@ -40,40 +40,39 @@ define i128 @cmovcc128(i64 signext %a, i128 %b, i128 %c) nounwind {
; RV32I-LABEL: cmovcc128:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: xori a1, a1, 123
; RV32I-NEXT: or a2, a1, a2
; RV32I-NEXT: mv a1, a3
; RV32I-NEXT: beqz a2, .LBB1_2
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: mv a2, a3
; RV32I-NEXT: beqz a1, .LBB1_2
; RV32I-NEXT: # %bb.1: # %entry
; RV32I-NEXT: mv a1, a4
; RV32I-NEXT: mv a2, a4
; RV32I-NEXT: .LBB1_2: # %entry
; RV32I-NEXT: lw a6, 0(a1)
; RV32I-NEXT: beqz a2, .LBB1_6
; RV32I-NEXT: beqz a1, .LBB1_5
; RV32I-NEXT: # %bb.3: # %entry
; RV32I-NEXT: addi a1, a4, 4
; RV32I-NEXT: lw a5, 0(a1)
; RV32I-NEXT: bnez a2, .LBB1_7
; RV32I-NEXT: addi a7, a4, 4
; RV32I-NEXT: bnez a1, .LBB1_6
; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: addi a1, a3, 8
; RV32I-NEXT: lw a1, 0(a1)
; RV32I-NEXT: bnez a2, .LBB1_8
; RV32I-NEXT: addi a5, a3, 8
; RV32I-NEXT: j .LBB1_7
; RV32I-NEXT: .LBB1_5:
; RV32I-NEXT: addi a2, a3, 12
; RV32I-NEXT: j .LBB1_9
; RV32I-NEXT: .LBB1_6:
; RV32I-NEXT: addi a1, a3, 4
; RV32I-NEXT: lw a5, 0(a1)
; RV32I-NEXT: beqz a2, .LBB1_4
; RV32I-NEXT: addi a7, a3, 4
; RV32I-NEXT: beqz a1, .LBB1_4
; RV32I-NEXT: .LBB1_6: # %entry
; RV32I-NEXT: addi a5, a4, 8
; RV32I-NEXT: .LBB1_7: # %entry
; RV32I-NEXT: addi a1, a4, 8
; RV32I-NEXT: lw a6, 0(a2)
; RV32I-NEXT: lw a7, 0(a7)
; RV32I-NEXT: lw a2, 0(a5)
; RV32I-NEXT: beqz a1, .LBB1_9
; RV32I-NEXT: # %bb.8: # %entry
; RV32I-NEXT: addi a1, a4, 12
; RV32I-NEXT: j .LBB1_10
; RV32I-NEXT: .LBB1_9:
; RV32I-NEXT: addi a1, a3, 12
; RV32I-NEXT: .LBB1_10: # %entry
; RV32I-NEXT: lw a1, 0(a1)
; RV32I-NEXT: beqz a2, .LBB1_5
; RV32I-NEXT: .LBB1_8: # %entry
; RV32I-NEXT: addi a2, a4, 12
; RV32I-NEXT: .LBB1_9: # %entry
; RV32I-NEXT: lw a2, 0(a2)
; RV32I-NEXT: sw a2, 12(a0)
; RV32I-NEXT: sw a1, 8(a0)
; RV32I-NEXT: sw a5, 4(a0)
; RV32I-NEXT: sw a1, 12(a0)
; RV32I-NEXT: sw a2, 8(a0)
; RV32I-NEXT: sw a7, 4(a0)
; RV32I-NEXT: sw a6, 0(a0)
; RV32I-NEXT: ret
;
@ -124,40 +123,39 @@ entry:
define i128 @cmov128(i1 %a, i128 %b, i128 %c) nounwind {
; RV32I-LABEL: cmov128:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: andi a4, a1, 1
; RV32I-NEXT: mv a1, a2
; RV32I-NEXT: bnez a4, .LBB3_2
; RV32I-NEXT: andi a1, a1, 1
; RV32I-NEXT: mv a4, a2
; RV32I-NEXT: bnez a1, .LBB3_2
; RV32I-NEXT: # %bb.1: # %entry
; RV32I-NEXT: mv a1, a3
; RV32I-NEXT: mv a4, a3
; RV32I-NEXT: .LBB3_2: # %entry
; RV32I-NEXT: lw a6, 0(a1)
; RV32I-NEXT: bnez a4, .LBB3_6
; RV32I-NEXT: bnez a1, .LBB3_5
; RV32I-NEXT: # %bb.3: # %entry
; RV32I-NEXT: addi a1, a3, 4
; RV32I-NEXT: lw a5, 0(a1)
; RV32I-NEXT: beqz a4, .LBB3_7
; RV32I-NEXT: addi a7, a3, 4
; RV32I-NEXT: beqz a1, .LBB3_6
; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: addi a1, a2, 8
; RV32I-NEXT: lw a1, 0(a1)
; RV32I-NEXT: beqz a4, .LBB3_8
; RV32I-NEXT: addi a5, a2, 8
; RV32I-NEXT: j .LBB3_7
; RV32I-NEXT: .LBB3_5:
; RV32I-NEXT: addi a2, a2, 12
; RV32I-NEXT: j .LBB3_9
; RV32I-NEXT: .LBB3_6:
; RV32I-NEXT: addi a1, a2, 4
; RV32I-NEXT: lw a5, 0(a1)
; RV32I-NEXT: bnez a4, .LBB3_4
; RV32I-NEXT: addi a7, a2, 4
; RV32I-NEXT: bnez a1, .LBB3_4
; RV32I-NEXT: .LBB3_6: # %entry
; RV32I-NEXT: addi a5, a3, 8
; RV32I-NEXT: .LBB3_7: # %entry
; RV32I-NEXT: addi a1, a3, 8
; RV32I-NEXT: lw a6, 0(a4)
; RV32I-NEXT: lw a7, 0(a7)
; RV32I-NEXT: lw a4, 0(a5)
; RV32I-NEXT: bnez a1, .LBB3_9
; RV32I-NEXT: # %bb.8: # %entry
; RV32I-NEXT: addi a1, a3, 12
; RV32I-NEXT: j .LBB3_10
; RV32I-NEXT: .LBB3_9:
; RV32I-NEXT: addi a1, a2, 12
; RV32I-NEXT: .LBB3_10: # %entry
; RV32I-NEXT: lw a1, 0(a1)
; RV32I-NEXT: bnez a4, .LBB3_5
; RV32I-NEXT: .LBB3_8: # %entry
; RV32I-NEXT: addi a2, a3, 12
; RV32I-NEXT: .LBB3_9: # %entry
; RV32I-NEXT: lw a2, 0(a2)
; RV32I-NEXT: sw a2, 12(a0)
; RV32I-NEXT: sw a1, 8(a0)
; RV32I-NEXT: sw a5, 4(a0)
; RV32I-NEXT: sw a1, 12(a0)
; RV32I-NEXT: sw a4, 8(a0)
; RV32I-NEXT: sw a7, 4(a0)
; RV32I-NEXT: sw a6, 0(a0)
; RV32I-NEXT: ret
;

View File

@ -14,40 +14,40 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: movslq (%rdi), %rax
; CHECK-NEXT: movslq (%rdi), %rdi
; CHECK-NEXT: movslq (%rsi), %r8
; CHECK-NEXT: movslq (%rdx), %r10
; CHECK-NEXT: movl (%rcx), %edi
; CHECK-NEXT: movslq (%r9), %rcx
; CHECK-NEXT: movq %rsp, %rdx
; CHECK-NEXT: subl %eax, %r8d
; CHECK-NEXT: movslq %r8d, %rsi
; CHECK-NEXT: movl (%rcx), %esi
; CHECK-NEXT: movq %rsp, %rcx
; CHECK-NEXT: subl %edi, %r8d
; CHECK-NEXT: movslq %r8d, %rdx
; CHECK-NEXT: js .LBB0_1
; CHECK-NEXT: # %bb.11: # %b63
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_14
; CHECK-NEXT: # %bb.12:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_13: # %a25b
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: testb %dil, %dil
; CHECK-NEXT: je .LBB0_13
; CHECK-NEXT: .LBB0_14: # %b85
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.15:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_16: # %a25b140
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: testb %dil, %dil
; CHECK-NEXT: je .LBB0_16
; CHECK-NEXT: .LBB0_1: # %a29b
; CHECK-NEXT: cmpl %r10d, %edi
; CHECK-NEXT: cmpl %r10d, %esi
; CHECK-NEXT: js .LBB0_10
; CHECK-NEXT: # %bb.2: # %b158
; CHECK-NEXT: movslq (%r9), %rsi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: movb $1, %r10b
@ -77,7 +77,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: js .LBB0_4
; CHECK-NEXT: # %bb.17: # %b179
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_18
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_37: # %a30b
@ -97,7 +97,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: je .LBB0_19
; CHECK-NEXT: .LBB0_4: # %a33b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: orl %r8d, %eax
; CHECK-NEXT: movl %eax, %r9d
; CHECK-NEXT: shrl $31, %r9d
@ -106,7 +106,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: .LBB0_5: # %a50b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: movl %r8d, %eax
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: movl %eax, %r11d
; CHECK-NEXT: shrl $31, %r11d
; CHECK-NEXT: testl %eax, %eax
@ -156,7 +156,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_21 Depth 3
; CHECK-NEXT: testq %rcx, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: js .LBB0_22
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_21: # %a35b
@ -169,14 +169,14 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_28: # %b1016
; CHECK-NEXT: # in Loop: Header=BB0_26 Depth=2
; CHECK-NEXT: testq %rcx, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_6
; CHECK-NEXT: .LBB0_26: # %b858
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_38 Depth 3
; CHECK-NEXT: # Child Loop BB0_29 Depth 3
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_27
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_38: # %a53b
@ -194,38 +194,38 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # Parent Loop BB0_26 Depth=2
; CHECK-NEXT: # => This Inner Loop Header: Depth=3
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: jle .LBB0_29
; CHECK-NEXT: jmp .LBB0_28
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_32: # %b1263
; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: jle .LBB0_7
; CHECK-NEXT: .LBB0_30: # %b1117
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_39 Depth 3
; CHECK-NEXT: # Child Loop BB0_33 Depth 3
; CHECK-NEXT: testq %rcx, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: js .LBB0_31
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_39: # %a63b
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # Parent Loop BB0_30 Depth=2
; CHECK-NEXT: # => This Inner Loop Header: Depth=3
; CHECK-NEXT: testq %rcx, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_39
; CHECK-NEXT: .LBB0_31: # %b1139
; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2
; CHECK-NEXT: testq %rcx, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_32
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_33: # %a63b1266
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # Parent Loop BB0_30 Depth=2
; CHECK-NEXT: # => This Inner Loop Header: Depth=3
; CHECK-NEXT: testq %rcx, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_33
; CHECK-NEXT: jmp .LBB0_32
; CHECK-NEXT: .p2align 4, 0x90
@ -237,7 +237,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_24 Depth 3
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_25
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_24: # %a45b

View File

@ -16,31 +16,30 @@ define void @foo(i8* nocapture %_stubArgs) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $152, %rsp
; CHECK-NEXT: movq 48(%rdi), %rax
; CHECK-NEXT: movl 64(%rdi), %edx
; CHECK-NEXT: movl 64(%rdi), %ecx
; CHECK-NEXT: movl $200, %esi
; CHECK-NEXT: addl 68(%rdi), %esi
; CHECK-NEXT: imull $46, %edx, %ecx
; CHECK-NEXT: addq %rsi, %rcx
; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: imull $47, %edx, %edx
; CHECK-NEXT: imull $46, %ecx, %edx
; CHECK-NEXT: addq %rsi, %rdx
; CHECK-NEXT: shlq $4, %rdx
; CHECK-NEXT: movaps (%rax,%rdx), %xmm0
; CHECK-NEXT: imull $47, %ecx, %ecx
; CHECK-NEXT: addq %rsi, %rcx
; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: cmpl $0, (%rdi)
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: je .LBB0_4
; CHECK-NEXT: jmp .LBB0_5
; CHECK-NEXT: .LBB0_1:
; CHECK-NEXT: movaps (%rax,%rcx), %xmm1
; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jne .LBB0_5
; CHECK-NEXT: .LBB0_4: # %entry
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: .LBB0_5: # %entry
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_1:
; CHECK-NEXT: movaps (%rax,%rdx), %xmm0
; CHECK-NEXT: .LBB0_3: # %entry
; CHECK-NEXT: movaps (%rax,%rcx), %xmm1
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jne .LBB0_5
; CHECK-NEXT: # %bb.4: # %entry
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: .LBB0_5: # %entry
; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq $152, %rsp
; CHECK-NEXT: retq
entry:

View File

@ -358,44 +358,57 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
;
; NOGATHER-LABEL: masked_gather_v8i32:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2
; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0
; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; NOGATHER-NEXT: vpmovmskb %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm3, %rcx
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB6_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_4: # %else2
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB6_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: je .LBB6_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm2
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB6_7
; NOGATHER-NEXT: je .LBB6_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_8: # %else8
; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: jne .LBB6_9
; NOGATHER-NEXT: je .LBB6_10
; NOGATHER-NEXT: # %bb.9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_10: # %else11
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB6_12
; NOGATHER-NEXT: .LBB6_11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: # %bb.11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_12: # %else14
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $64, %al
; NOGATHER-NEXT: jne .LBB6_13
; NOGATHER-NEXT: # %bb.14: # %else17
@ -404,26 +417,6 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: .LBB6_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB6_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm3
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB6_8
; NOGATHER-NEXT: .LBB6_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: je .LBB6_10
; NOGATHER-NEXT: .LBB6_9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: jne .LBB6_11
; NOGATHER-NEXT: jmp .LBB6_12
; NOGATHER-NEXT: .LBB6_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
@ -472,44 +465,58 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
;
; NOGATHER-LABEL: masked_gather_v8float:
; NOGATHER: # %bb.0: # %entry
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2
; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2
; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0
; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; NOGATHER-NEXT: vpmovmskb %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm3, %rcx
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; NOGATHER-NEXT: .LBB7_2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB7_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],mem[0],xmm1[2,3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_4: # %else2
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $4, %al
; NOGATHER-NEXT: jne .LBB7_5
; NOGATHER-NEXT: # %bb.6: # %else5
; NOGATHER-NEXT: je .LBB7_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0,1],mem[0],xmm1[3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_6: # %else5
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: jne .LBB7_7
; NOGATHER-NEXT: je .LBB7_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_8: # %else8
; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: jne .LBB7_9
; NOGATHER-NEXT: je .LBB7_10
; NOGATHER-NEXT: # %bb.9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_10: # %else11
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB7_12
; NOGATHER-NEXT: .LBB7_11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: # %bb.11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_12: # %else14
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $64, %al
; NOGATHER-NEXT: jne .LBB7_13
; NOGATHER-NEXT: # %bb.14: # %else17
@ -518,27 +525,6 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: .LBB7_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB7_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],mem[0],xmm1[3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB7_8
; NOGATHER-NEXT: .LBB7_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: testb $16, %al
; NOGATHER-NEXT: je .LBB7_10
; NOGATHER-NEXT: .LBB7_9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: jne .LBB7_11
; NOGATHER-NEXT: jmp .LBB7_12
; NOGATHER-NEXT: .LBB7_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2

View File

@ -165,14 +165,13 @@ define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <
; NOCMOV-NEXT: fnstsw %ax
; NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
; NOCMOV-NEXT: sahf
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx
; NOCMOV-NEXT: jne .LBB4_3
; NOCMOV-NEXT: # %bb.1: # %entry
; NOCMOV-NEXT: jp .LBB4_3
; NOCMOV-NEXT: # %bb.2: # %entry
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx
; NOCMOV-NEXT: .LBB4_3: # %entry
; NOCMOV-NEXT: movl (%eax), %ecx
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx
; NOCMOV-NEXT: jne .LBB4_6
; NOCMOV-NEXT: # %bb.4: # %entry
@ -181,7 +180,6 @@ define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx
; NOCMOV-NEXT: .LBB4_6: # %entry
; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; NOCMOV-NEXT: movl (%edx), %edx
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi
; NOCMOV-NEXT: jne .LBB4_9
; NOCMOV-NEXT: # %bb.7: # %entry
@ -189,6 +187,8 @@ define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <
; NOCMOV-NEXT: # %bb.8: # %entry
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi
; NOCMOV-NEXT: .LBB4_9: # %entry
; NOCMOV-NEXT: movl (%ecx), %ecx
; NOCMOV-NEXT: movl (%edx), %edx
; NOCMOV-NEXT: movl (%esi), %esi
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edi
; NOCMOV-NEXT: jne .LBB4_12

View File

@ -557,63 +557,59 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB7_1
; MCU-NEXT: # %bb.2:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
; MCU-NEXT: movl (%eax), %eax
; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
; MCU-NEXT: je .LBB7_5
; MCU-NEXT: .LBB7_4:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
; MCU-NEXT: movl (%ecx), %ecx
; MCU-NEXT: je .LBB7_8
; MCU-NEXT: .LBB7_7:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
; MCU-NEXT: movl (%esi), %esi
; MCU-NEXT: je .LBB7_11
; MCU-NEXT: .LBB7_10:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
; MCU-NEXT: movl (%edi), %edi
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: je .LBB7_14
; MCU-NEXT: .LBB7_13:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
; MCU-NEXT: movl (%ebx), %ebx
; MCU-NEXT: je .LBB7_17
; MCU-NEXT: .LBB7_16:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: jmp .LBB7_18
; MCU-NEXT: .LBB7_1:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
; MCU-NEXT: movl (%eax), %eax
; MCU-NEXT: jmp .LBB7_15
; MCU-NEXT: .LBB7_1:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
; MCU-NEXT: jne .LBB7_4
; MCU-NEXT: .LBB7_5:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
; MCU-NEXT: movl (%ecx), %ecx
; MCU-NEXT: jne .LBB7_7
; MCU-NEXT: .LBB7_8:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
; MCU-NEXT: movl (%esi), %esi
; MCU-NEXT: jne .LBB7_10
; MCU-NEXT: .LBB7_11:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
; MCU-NEXT: movl (%edi), %edi
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: jne .LBB7_13
; MCU-NEXT: .LBB7_14:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
; MCU-NEXT: movl (%ebx), %ebx
; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
; MCU-NEXT: .LBB7_15:
; MCU-NEXT: movl (%edi), %ebx
; MCU-NEXT: movl (%ecx), %edi
; MCU-NEXT: movl (%esi), %esi
; MCU-NEXT: movl (%ebp), %ecx
; MCU-NEXT: movl (%eax), %eax
; MCU-NEXT: jne .LBB7_16
; MCU-NEXT: .LBB7_17:
; MCU-NEXT: # %bb.17:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: jmp .LBB7_18
; MCU-NEXT: .LBB7_16:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: .LBB7_18:
; MCU-NEXT: movl (%ebp), %ebp
; MCU-NEXT: decl %ebp
; MCU-NEXT: decl %ebx
; MCU-NEXT: decl %edi
; MCU-NEXT: decl %esi
; MCU-NEXT: decl %ecx
; MCU-NEXT: decl %eax
; MCU-NEXT: movl %eax, 20(%edx)
; MCU-NEXT: movl %ecx, 16(%edx)
; MCU-NEXT: decl %ecx
; MCU-NEXT: decl %esi
; MCU-NEXT: decl %edi
; MCU-NEXT: decl %ebx
; MCU-NEXT: movl %ebx, 20(%edx)
; MCU-NEXT: movl %edi, 16(%edx)
; MCU-NEXT: movl %esi, 12(%edx)
; MCU-NEXT: movl %edi, 8(%edx)
; MCU-NEXT: movl %ebx, 4(%edx)
; MCU-NEXT: movl %ecx, 8(%edx)
; MCU-NEXT: movl %eax, 4(%edx)
; MCU-NEXT: movl %ebp, (%edx)
; MCU-NEXT: popl %esi
; MCU-NEXT: popl %edi

View File

@ -4361,7 +4361,6 @@ define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-LABEL: uitofp_load_4i64_to_4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
@ -4377,6 +4376,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB83_3:
; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
@ -4710,40 +4710,38 @@ define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-LABEL: uitofp_load_8i64_to_8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm5
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_1
; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: jmp .LBB87_3
; SSE2-NEXT: .LBB87_1:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: addss %xmm3, %xmm3
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm2
; SSE2-NEXT: .LBB87_3:
; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_4
; SSE2-NEXT: # %bb.5:
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: jmp .LBB87_6
; SSE2-NEXT: .LBB87_4:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: addss %xmm4, %xmm4
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB87_6:
; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_7
; SSE2-NEXT: # %bb.8:
@ -4759,55 +4757,59 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB87_9:
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: movdqa 48(%rdi), %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_10
; SSE2-NEXT: # %bb.11:
; SSE2-NEXT: cvtsi2ss %rax, %xmm6
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: jmp .LBB87_12
; SSE2-NEXT: .LBB87_10:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm6
; SSE2-NEXT: addss %xmm6, %xmm6
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: addss %xmm4, %xmm4
; SSE2-NEXT: .LBB87_12:
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: movq %xmm6, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_13
; SSE2-NEXT: # %bb.14:
; SSE2-NEXT: xorps %xmm5, %xmm5
; SSE2-NEXT: cvtsi2ss %rax, %xmm5
; SSE2-NEXT: xorps %xmm3, %xmm3
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: jmp .LBB87_15
; SSE2-NEXT: .LBB87_13:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: xorps %xmm5, %xmm5
; SSE2-NEXT: cvtsi2ss %rax, %xmm5
; SSE2-NEXT: addss %xmm5, %xmm5
; SSE2-NEXT: xorps %xmm3, %xmm3
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: addss %xmm3, %xmm3
; SSE2-NEXT: .LBB87_15:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: movdqa 32(%rdi), %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE2-NEXT: movq %xmm6, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_16
; SSE2-NEXT: # %bb.17:
; SSE2-NEXT: cvtsi2ss %rax, %xmm7
; SSE2-NEXT: xorps %xmm6, %xmm6
; SSE2-NEXT: cvtsi2ss %rax, %xmm6
; SSE2-NEXT: jmp .LBB87_18
; SSE2-NEXT: .LBB87_16:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm7
; SSE2-NEXT: addss %xmm7, %xmm7
; SSE2-NEXT: xorps %xmm6, %xmm6
; SSE2-NEXT: cvtsi2ss %rax, %xmm6
; SSE2-NEXT: addss %xmm6, %xmm6
; SSE2-NEXT: .LBB87_18:
; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_19
; SSE2-NEXT: # %bb.20:
@ -4823,9 +4825,9 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB87_21:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_22
@ -4843,7 +4845,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: addss %xmm2, %xmm2
; SSE2-NEXT: .LBB87_24:
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_load_8i64_to_8f32: