1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00
llvm-mirror/test/CodeGen/X86/avx-vbroadcast.ll
Simon Pilgrim 1ab70808c2 [X86][AVX] Access a scalar float/double as a free extract from a broadcast load (PR43217)
If a fp scalar is loaded and then used as both a scalar and a vector broadcast, perform the load as a broadcast and then extract the scalar for 'free' from the 0th element.

This involved switching the order of the X86ISD::BROADCAST combines so we only convert to X86ISD::BROADCAST_LOAD once all other canonicalizations have been attempted.

Adds a DAGCombinerInfo::recursivelyDeleteUnusedNodes wrapper.

Fixes PR43217

Differential Revision: https://reviews.llvm.org/D68544

llvm-svn: 373871
2019-10-06 21:11:45 +00:00

978 lines
34 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: A:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: A:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: A2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %esi, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
; X32-NEXT: movl 4(%ecx), %esi
; X32-NEXT: vbroadcastsd (%ecx), %ymm0
; X32-NEXT: movl %edx, (%eax)
; X32-NEXT: movl %esi, 4(%eax)
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: A2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
ret <8 x i32> %vecinit6.i
}
define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
%vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
%vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
%vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
%vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
ret <8 x i32> %vecinit14.i
}
define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: B3:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %ecx
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: vmovd %ecx, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B3:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
%vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
%vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
%vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
%vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
ret <8 x i32> %vecinit14.i
}
define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: C:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: C:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 8
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: C2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastsd (%ecx), %ymm0
; X32-NEXT: vmovlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: C2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: vmovlps %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 8
store double %q, double* %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
ret <8 x float> %vecinit6.i
}
define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
%vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
%vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
%vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
%vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
ret <8 x float> %vecinit14.i
}
define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: D3:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx), %ymm0
; X32-NEXT: vmovss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: D3:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
%vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
%vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
%vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
%vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
ret <8 x float> %vecinit14.i
}
;;;; 128-bit versions
define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: e:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: e:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
ret <4 x float> %vecinit6.i
}
define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: e2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx), %xmm0
; X32-NEXT: vmovss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: e2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
ret <4 x float> %vecinit6.i
}
; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X32-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT: retq
entry:
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
ret <4 x float> %vecinit6.i
}
define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: F:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: F:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
ret <4 x i32> %vecinit6.i
}
define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: F2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %ecx
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: vmovd %ecx, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: F2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
ret <4 x i32> %vecinit6.i
}
; FIXME: Pointer adjusted broadcasts
define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i32_4i32_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %ret
}
define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x i32> %ret
}
define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_8i32_55555555:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x i32>, <8 x i32>* %ptr
%ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %ret
}
define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f32_4f32_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 4(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x float> %ret
}
define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_4f32_33333333:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x float> %ret
}
define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_8f32_55555555:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x float>, <8 x float>* %ptr
%ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x float> %ret
}
define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2i64_2i64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %ret
}
define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i64> %ret
}
define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_4i64_2222:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i64>, <4 x i64>* %ptr
%ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i64> %ret
}
define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %ret
}
define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_2f64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x double> %ret
}
define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_4f64_2222:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x double>, <4 x double>* %ptr
%ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x double> %ret
}
; Unsupported vbroadcasts
define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: G:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: G:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
ret <2 x i64> %vecinit2.i
}
define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: G2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %esi, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
; X32-NEXT: movl 4(%ecx), %esi
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: movl %edx, (%eax)
; X32-NEXT: movl %esi, 4(%eax)
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: G2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
ret <2 x i64> %vecinit2.i
}
define <4 x i32> @H(<4 x i32> %a) {
; X32-LABEL: H:
; X32: ## %bb.0: ## %entry
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: H:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: retq
entry:
%x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
ret <4 x i32> %x
}
define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: I:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: I:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 4
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: I2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: vmovlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: I2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: vmovlps %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 4
store double %q, double* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx), %xmm0
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: movl %eax, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: _RR:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: movl %eax, (%rax)
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
; force a chain
%j = load i32, i32* %k, align 4
store i32 %j, i32* undef
ret <4 x float> %vecinit6.i
}
define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _RR2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%v = insertelement <4 x float> undef, float %q, i32 0
%t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %t
}
; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
; (via the insertelements).
define <8 x float> @splat_concat1(float* %p) {
; X32-LABEL: splat_concat1:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat1:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, float* %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
%4 = insertelement <4 x float> %3, float %1, i32 2
%5 = insertelement <4 x float> %4, float %1, i32 3
%6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %6
}
define <8 x float> @splat_concat2(float* %p) {
; X32-LABEL: splat_concat2:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat2:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, float* %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
%4 = insertelement <4 x float> %3, float %1, i32 2
%5 = insertelement <4 x float> %4, float %1, i32 3
%6 = insertelement <4 x float> undef, float %1, i32 0
%7 = insertelement <4 x float> %6, float %1, i32 1
%8 = insertelement <4 x float> %7, float %1, i32 2
%9 = insertelement <4 x float> %8, float %1, i32 3
%10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %10
}
define <4 x double> @splat_concat3(double* %p) {
; X32-LABEL: splat_concat3:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat3:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
%4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x double> %4
}
define <4 x double> @splat_concat4(double* %p) {
; X32-LABEL: splat_concat4:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat4:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
%4 = insertelement <2 x double> undef, double %1, i32 0
%5 = insertelement <2 x double> %2, double %1, i32 1
%6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %6
}
; PR34041
define <4 x double> @broadcast_shuffle_1000(double* %p) {
; X32-LABEL: broadcast_shuffle_1000:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: broadcast_shuffle_1000:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x double> %3
}
define <4 x double> @broadcast_shuffle1032(double* %p) {
; X32-LABEL: broadcast_shuffle1032:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: broadcast_shuffle1032:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p
%2 = insertelement <2 x double> undef, double %1, i32 1
%3 = insertelement <2 x double> undef, double %1, i32 0
%4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x double> %4
}
define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) {
; X32-LABEL: broadcast_v16i32:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx), %ymm0
; X32-NEXT: vmovups %ymm0, 32(%eax)
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: broadcast_v16i32:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: vmovups %ymm0, 32(%rsi)
; X64-NEXT: vmovups %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%1 = load i32, i32* %a, align 4
%2 = insertelement <8 x i32> undef, i32 %1, i32 0
%3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
%4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i32> %4, <16 x i32>* %b, align 4
ret void
}
;
; Broadcast scale factor for xyz vector - slp will have vectorized xy.
;
define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture readonly) nounwind {
; X32-LABEL: broadcast_scale_xyz:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: vmulpd (%eax), %xmm0, %xmm1
; X32-NEXT: vmulsd 16(%eax), %xmm0, %xmm0
; X32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X32-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovsd %xmm0, (%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; X64-LABEL: broadcast_scale_xyz:
; X64: ## %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: vmulpd (%rsi), %xmm0, %xmm1
; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%3 = bitcast double* %1 to <2 x double>*
%4 = load <2 x double>, <2 x double>* %3, align 8
%5 = getelementptr inbounds double, double* %1, i64 2
%6 = load double, double* %5, align 8
%7 = load double, double* %0, align 8
%8 = insertelement <2 x double> undef, double %7, i32 0
%9 = shufflevector <2 x double> %8, <2 x double> undef, <2 x i32> zeroinitializer
%10 = fmul <2 x double> %4, %9
%11 = fmul double %6, %7
%12 = extractelement <2 x double> %10, i32 0
%13 = extractelement <2 x double> %10, i32 1
%14 = fadd double %12, %13
%15 = fadd double %11, %14
ret double %15
}
;
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
;
define float @broadcast_lifetime() nounwind {
; X32-LABEL: broadcast_lifetime:
; X32: ## %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: subl $40, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll _gfunc
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll _gfunc
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: flds {{[0-9]+}}(%esp)
; X32-NEXT: addl $40, %esp
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: broadcast_lifetime:
; X64: ## %bb.0:
; X64-NEXT: subq $40, %rsp
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT: addq $40, %rsp
; X64-NEXT: retq
%1 = alloca <4 x float>, align 16
%2 = alloca <4 x float>, align 16
%3 = bitcast <4 x float>* %1 to i8*
%4 = bitcast <4 x float>* %2 to i8*
call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
call void @gfunc(<4 x float>* %1)
%5 = load <4 x float>, <4 x float>* %1, align 16
call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
call void @llvm.lifetime.start.p0i8(i64 16, i8* %4)
call void @gfunc(<4 x float>* %2)
%6 = load <4 x float>, <4 x float>* %2, align 16
call void @llvm.lifetime.end.p0i8(i64 16, i8* %4)
%7 = extractelement <4 x float> %5, i32 1
%8 = extractelement <4 x float> %6, i32 1
%9 = fsub float %8, %7
ret float %9
}
declare void @gfunc(<4 x float>*)
declare void @llvm.lifetime.start.p0i8(i64, i8*)
declare void @llvm.lifetime.end.p0i8(i64, i8*)