[x86] Adjust the patterns for lowering X86vzmovl nodes which don't
perform a load to use blendps rather than movss when it is available.

For non-loads, blendps is *much* faster. It can execute on two ports in
Sandy Bridge and Ivy Bridge, and *three* ports on Haswell. This fixes one
of the "regressions" from aggressively taking the "insertion" path in the
new vector shuffle lowering.

This does highlight one problem with blendps -- it isn't commuted as
heavily as it should be. That's future work though.

llvm-svn: 219022
This commit is contained in:
parent e04abe6476
commit 13d884e744
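For readers skimming the diff below: the register-only case this commit retunes is the "keep lane 0, zero the upper lanes" idiom that X86vzmovl models. A minimal C intrinsics sketch of the two equivalent forms follows; it is illustrative only and not part of the patch (helper names are invented, and it assumes an SSE4.1-capable build, e.g. compiling with -msse4.1).

#include <immintrin.h>

/* Both helpers compute [x0, 0, 0, 0] from a 4 x float vector. The first is
   roughly the intrinsic spelling of the old movss-based lowering; the second
   is roughly the intrinsic spelling of the blendps-based lowering the new
   patterns select when no load is folded. */

static inline __m128 zero_upper_movss(__m128 x) {
  /* _mm_move_ss(a, b): lane 0 taken from b, lanes 1-3 taken from a (zero here). */
  return _mm_move_ss(_mm_setzero_ps(), x);
}

static inline __m128 zero_upper_blendps(__m128 x) {
  /* _mm_blend_ps(a, b, 0x1): mask bit 0 selects b (x), the other lanes select a (zero). */
  return _mm_blend_ps(_mm_setzero_ps(), x, 0x1);
}

Per the commit message, the blend form can issue on two ports on Sandy Bridge and Ivy Bridge and three on Haswell, which is why the new patterns prefer it for the non-load case.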
@@ -693,6 +693,7 @@ def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
 def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
 def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
 def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">;
 def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
 def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
 def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
@@ -612,29 +612,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
 
 // Patterns
 let Predicates = [UseAVX] in {
-  let AddedComplexity = 15 in {
-  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
-  // MOVS{S,D} to the lower bits.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
-            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
-  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
-            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
-
-  // Move low f32 and clear high bits.
-  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSSrr (v4f32 (V_SET0)),
-             (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSSrr (v4i32 (V_SET0)),
-             (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
-  }
-
   let AddedComplexity = 20 in {
   // MOVSSrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -670,31 +647,10 @@ let Predicates = [UseAVX] in {
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   }
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0),
-             (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
-             sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0),
-             (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
-             sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                    (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
 
-  // Move low f64 and clear high bits.
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSDrr (v2f64 (V_SET0)),
-             (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
-
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSDrr (v2i64 (V_SET0)),
-             (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
-
   // Extract and store.
   def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                    addr:$dst),
@@ -745,7 +701,6 @@ let Predicates = [UseAVX] in {
              (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
             sub_xmm)>;
 
-
 // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
 // is during lowering, where it's not possible to recognize the fold cause
 // it has two uses through a bitcast. One use disappears at isel time and the
@@ -761,7 +716,7 @@ let Predicates = [UseAVX] in {
 }
 
 let Predicates = [UseSSE1] in {
-  let AddedComplexity = 15 in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
@@ -795,7 +750,7 @@ let Predicates = [UseSSE1] in {
 }
 
 let Predicates = [UseSSE2] in {
-  let AddedComplexity = 15 in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSD to the lower bits.
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
@@ -7576,6 +7531,57 @@ let Predicates = [HasAVX2] in {
             (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
 }
 
+// Patterns
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+  }
+
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0),
+             (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+             sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0),
+             (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+             sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+}
+
+let Predicates = [UseSSE41] in {
+  // With SSE41 we can use blends for these patterns.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+            (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
+}
+
+
 /// SS41I_ternary_int - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
 multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
@@ -228,9 +228,9 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK: # BB#0:
 ; CHECK-NEXT: xorps %xmm2, %xmm2
 ; CHECK-NEXT: xorps %xmm3, %xmm3
-; CHECK-NEXT: movss %xmm0, %xmm3
+; CHECK-NEXT: blendps $1, %xmm0, %xmm3
 ; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[0,0]
-; CHECK-NEXT: movss %xmm1, %xmm2
+; CHECK-NEXT: blendps $1, %xmm1, %xmm2
 ; CHECK-NEXT: orps %xmm3, %xmm2
 ; CHECK-NEXT: movaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
@@ -522,7 +522,7 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X00A:
 ; X32: ## BB#0:
 ; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss %xmm0, %xmm2
+; X32-NEXT: blendps $1, %xmm0, %xmm2
 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X32-NEXT: movaps %xmm2, %xmm0
 ; X32-NEXT: retl
@@ -530,7 +530,7 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
 ; X64-LABEL: shuf_X00A:
 ; X64: ## BB#0:
 ; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss %xmm0, %xmm2
+; X64-NEXT: blendps $1, %xmm0, %xmm2
 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X64-NEXT: movaps %xmm2, %xmm0
 ; X64-NEXT: retq
@@ -546,7 +546,7 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X00X:
 ; X32: ## BB#0:
 ; X32-NEXT: xorps %xmm1, %xmm1
-; X32-NEXT: movss %xmm0, %xmm1
+; X32-NEXT: blendps $1, %xmm0, %xmm1
 ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X32-NEXT: movaps %xmm1, %xmm0
 ; X32-NEXT: retl
@@ -554,7 +554,7 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 ; X64-LABEL: shuf_X00X:
 ; X64: ## BB#0:
 ; X64-NEXT: xorps %xmm1, %xmm1
-; X64-NEXT: movss %xmm0, %xmm1
+; X64-NEXT: blendps $1, %xmm0, %xmm1
 ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X64-NEXT: movaps %xmm1, %xmm0
 ; X64-NEXT: retq
@@ -570,7 +570,7 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X0YC:
 ; X32: ## BB#0:
 ; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss %xmm0, %xmm2
+; X32-NEXT: blendps $1, %xmm0, %xmm2
 ; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X32-NEXT: movaps %xmm2, %xmm0
@@ -579,7 +579,7 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 ; X64-LABEL: shuf_X0YC:
 ; X64: ## BB#0:
 ; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss %xmm0, %xmm2
+; X64-NEXT: blendps $1, %xmm0, %xmm2
 ; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X64-NEXT: movaps %xmm2, %xmm0
@@ -692,7 +692,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X00A:
 ; X32: ## BB#0:
 ; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss %xmm0, %xmm2
+; X32-NEXT: blendps $1, %xmm0, %xmm2
 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X32-NEXT: movaps %xmm2, %xmm0
 ; X32-NEXT: retl
@@ -700,7 +700,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; X64-LABEL: i32_shuf_X00A:
 ; X64: ## BB#0:
 ; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss %xmm0, %xmm2
+; X64-NEXT: blendps $1, %xmm0, %xmm2
 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
 ; X64-NEXT: movaps %xmm2, %xmm0
 ; X64-NEXT: retq
@@ -716,7 +716,7 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X00X:
 ; X32: ## BB#0:
 ; X32-NEXT: xorps %xmm1, %xmm1
-; X32-NEXT: movss %xmm0, %xmm1
+; X32-NEXT: blendps $1, %xmm0, %xmm1
 ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X32-NEXT: movaps %xmm1, %xmm0
 ; X32-NEXT: retl
@@ -724,7 +724,7 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 ; X64-LABEL: i32_shuf_X00X:
 ; X64: ## BB#0:
 ; X64-NEXT: xorps %xmm1, %xmm1
-; X64-NEXT: movss %xmm0, %xmm1
+; X64-NEXT: blendps $1, %xmm0, %xmm1
 ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
 ; X64-NEXT: movaps %xmm1, %xmm0
 ; X64-NEXT: retq
@@ -740,7 +740,7 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X0YC:
 ; X32: ## BB#0:
 ; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss %xmm0, %xmm2
+; X32-NEXT: blendps $1, %xmm0, %xmm2
 ; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X32-NEXT: movaps %xmm2, %xmm0
@@ -749,7 +749,7 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
 ; X64-LABEL: i32_shuf_X0YC:
 ; X64: ## BB#0:
 ; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss %xmm0, %xmm2
+; X64-NEXT: blendps $1, %xmm0, %xmm2
 ; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0]
 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
 ; X64-NEXT: movaps %xmm2, %xmm0
@@ -39,7 +39,7 @@ entry:
 define <4 x float> @test3(<4 x float> %A) {
 ; CHECK-LABEL: test3:
 ; CHECK: xorps %[[X1:xmm[0-9]+]], %[[X1]]
-; CHECK-NEXT: movss %xmm0, %[[X1]]
+; CHECK-NEXT: blendps $1, %xmm0, %[[X1]]
 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = [[X1]][1,0,1,1]
 ; CHECK-NEXT: retl
 ;
@@ -438,17 +438,38 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
-; SSE-LABEL: shuffle_v4f32_4zzz:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movss %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v4f32_4zzz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_4zzz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_4zzz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_4zzz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4f32_4zzz:
 ; AVX: # BB#0:
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT: retq
 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %shuffle
@@ -639,34 +660,76 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_4zzz:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movss %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v4i32_4zzz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_4zzz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_4zzz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_4zzz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_4zzz:
 ; AVX: # BB#0:
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT: retq
 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x i32> %shuffle
 }
 
 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_z4zz:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movss %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v4i32_z4zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_z4zz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z4zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_z4zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_z4zz:
 ; AVX: # BB#0:
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX-NEXT: retq
 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
@@ -674,17 +737,38 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_zz4z:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movss %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v4i32_zz4z:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_zz4z:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zz4z:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_zz4z:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zz4z:
 ; AVX: # BB#0:
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX-NEXT: retq
 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
@@ -692,17 +776,38 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
-; SSE-LABEL: shuffle_v4i32_zuu4:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movss %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v4i32_zuu4:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_zuu4:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zuu4:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_zuu4:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zuu4:
 ; AVX: # BB#0:
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
 ; AVX-NEXT: retq
 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
@@ -1031,12 +1136,33 @@ define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
 }
 
 define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
-; SSE-LABEL: insert_reg_and_zero_v4f32:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movss %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_reg_and_zero_v4f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_and_zero_v4f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: insert_reg_and_zero_v4f32:
 ; AVX: # BB#0:
@@ -678,8 +678,8 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 ; AVX1-LABEL: insert_reg_and_zero_v4i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: insert_reg_and_zero_v4i64:
@@ -697,8 +697,8 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vmovq (%rdi), %xmm0
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64: