; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s

; rdar://11897677

;CHECK-LABEL: intrin_pmov:
;CHECK: pmovzxbw (%{{.*}}), %xmm0
;CHECK-NEXT: movdqu
;CHECK-NEXT: ret
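; The zext intrinsic's load should be folded into pmovzxbw as a memory
; operand, and @llvm.x86.sse2.storeu.dq should remain a single unaligned
; store (movdqu), as the CHECK lines above require.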
define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable ssp {
  %1 = bitcast i8* %src to <2 x i64>*
  %2 = load <2 x i64>* %1, align 16
  %3 = bitcast <2 x i64> %2 to <16 x i8>
  %4 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %3) nounwind
  %5 = bitcast i16* %dest to i8*
  %6 = bitcast <8 x i16> %4 to <16 x i8>
  tail call void @llvm.x86.sse2.storeu.dq(i8* %5, <16 x i8> %6) nounwind
  ret void
}

declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone

declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind

; x86: Move bitcasts outside concat_vector (llvm-svn: 192866).
;
; Consider the following:
;
;   typedef unsigned short ushort4U __attribute__((ext_vector_type(4), aligned(2)));
;   typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
;   typedef unsigned short ushort8 __attribute__((ext_vector_type(8)));
;   typedef int int4 __attribute__((ext_vector_type(4)));
;
;   int4 __bbase_cvt_int(ushort4 v) {
;     ushort8 a;
;     a.lo = v;
;     return _mm_cvtepu16_epi32(a);
;   }
;
; This generates the (not unreasonable) IR:
;
;   define <4 x i32> @foo0(double %v.coerce) nounwind ssp {
;     %tmp = bitcast double %v.coerce to <4 x i16>
;     %tmp1 = shufflevector <4 x i16> %tmp, <4 x i16> undef,
;             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
;     %tmp2 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp1)
;     ret <4 x i32> %tmp2
;   }
;
; The problem arises when type legalization gets hold of the v4i16: it is
; legalized by spilling to the stack, then doing a zero-extending load.
; Things go even more silly from there, ending up with something like:
;
;   _foo0:
;     movsd    %xmm0, -8(%rsp)   <== Spill to the stack.
;     movq     -8(%rsp), %xmm0   <== Reload it right back out.
;     pmovzxwd %xmm0, %xmm1      <== Here's what we actually asked for.
;     pblendw  $1, %xmm1, %xmm0  <== We don't need this at all.
;     pmovzxwd %xmm0, %xmm0      <== We already did this.
;     ret
;
; The v8i8-to-v8i16 zext intrinsic gives even worse results, with two
; table lookups via pshufb instructions(!!).
;
; To avoid all that, we can move the bitcast until after the wider (legal)
; vector type has been formed. Then normal codegen flows along nicely and
; we get the expected:
;
;   _foo0:
;     pmovzxwd %xmm0, %xmm0
;     ret
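;
; An IR-level sketch of the idea (illustration only; the combine actually
; runs on SelectionDAG nodes, and %wide is a name invented here): instead
; of bitcasting the scalar to the illegal <4 x i16> and then widening it,
; widen first and bitcast the wide, legal type once:
;
;   %wide = insertelement <2 x double> undef, double %v.coerce, i32 0
;   %tmp1 = bitcast <2 x double> %wide to <8 x i16>
;
; On little-endian x86 this produces the same <8 x i16> value in lanes
; 0-3 (lanes 4-7 are undef either way), but no v4i16 value ever exists,
; so type legalization has nothing to spill.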

; rdar://15245794

define <4 x i32> @foo0(double %v.coerce) nounwind ssp {
; CHECK-LABEL: foo0
; CHECK: pmovzxwd %xmm0, %xmm0
; CHECK-NEXT: ret
  %tmp = bitcast double %v.coerce to <4 x i16>
  %tmp1 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp2 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp1) nounwind
  ret <4 x i32> %tmp2
}
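
; Same combine, v8i8 -> v8i16 case; per the notes above, this zext
; intrinsic previously lowered to two pshufb table lookups.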
define <8 x i16> @foo1(double %v.coerce) nounwind ssp {
; CHECK-LABEL: foo1
; CHECK: pmovzxbw %xmm0, %xmm0
; CHECK-NEXT: ret
  %tmp = bitcast double %v.coerce to <8 x i8>
  %tmp1 = shufflevector <8 x i8> %tmp, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp2 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %tmp1)
  ret <8 x i16> %tmp2
}

declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone