1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-26 14:33:02 +02:00
llvm-mirror/test/CodeGen/ARM/vbsl.ll
Jim Grosbach 135898ebe3 ARM: Use a dedicated intrinsic for vector bitwise select.
The expression based expansion too often results in IR level optimizations
splitting the intermediate values into separate basic blocks, preventing
the formation of the VBSL instruction as the code author intended. In
particular, LICM would often hoist part of the computation out of a loop.

rdar://11011471

llvm-svn: 164340
2012-09-21 00:18:20 +00:00

155 lines
5.3 KiB
LLVM

; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: v_bsli8:
;CHECK: vbsl
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = and <8 x i8> %tmp1, %tmp2
%tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <8 x i8> %tmp5, %tmp3
%tmp7 = or <8 x i8> %tmp4, %tmp6
ret <8 x i8> %tmp7
}
define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: v_bsli16:
;CHECK: vbsl
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = and <4 x i16> %tmp1, %tmp2
%tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <4 x i16> %tmp5, %tmp3
%tmp7 = or <4 x i16> %tmp4, %tmp6
ret <4 x i16> %tmp7
}
define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: v_bsli32:
;CHECK: vbsl
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = and <2 x i32> %tmp1, %tmp2
%tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
%tmp6 = and <2 x i32> %tmp5, %tmp3
%tmp7 = or <2 x i32> %tmp4, %tmp6
ret <2 x i32> %tmp7
}
define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
;CHECK: v_bsli64:
;CHECK: vbsl
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = load <1 x i64>* %C
%tmp4 = and <1 x i64> %tmp1, %tmp2
%tmp5 = xor <1 x i64> %tmp1, < i64 -1 >
%tmp6 = and <1 x i64> %tmp5, %tmp3
%tmp7 = or <1 x i64> %tmp4, %tmp6
ret <1 x i64> %tmp7
}
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK: v_bslQi8:
;CHECK: vbsl
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = and <16 x i8> %tmp1, %tmp2
%tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <16 x i8> %tmp5, %tmp3
%tmp7 = or <16 x i8> %tmp4, %tmp6
ret <16 x i8> %tmp7
}
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK: v_bslQi16:
;CHECK: vbsl
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = and <8 x i16> %tmp1, %tmp2
%tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <8 x i16> %tmp5, %tmp3
%tmp7 = or <8 x i16> %tmp4, %tmp6
ret <8 x i16> %tmp7
}
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK: v_bslQi32:
;CHECK: vbsl
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = and <4 x i32> %tmp1, %tmp2
%tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp6 = and <4 x i32> %tmp5, %tmp3
%tmp7 = or <4 x i32> %tmp4, %tmp6
ret <4 x i32> %tmp7
}
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
;CHECK: v_bslQi64:
;CHECK: vbsl
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = and <2 x i64> %tmp1, %tmp2
%tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
%tmp6 = and <2 x i64> %tmp5, %tmp3
%tmp7 = or <2 x i64> %tmp4, %tmp6
ret <2 x i64> %tmp7
}
define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
; CHECK: f1:
; CHECK: vbsl
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
ret <8 x i8> %vbsl.i
}
define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK: f2:
; CHECK: vbsl
%vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
ret <4 x i16> %vbsl3.i
}
define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK: f3:
; CHECK: vbsl
%vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
ret <2 x i32> %vbsl3.i
}
define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
; CHECK: g1:
; CHECK: vbsl
%vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
ret <16 x i8> %vbsl.i
}
define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
; CHECK: g2:
; CHECK: vbsl
%vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
ret <8 x i16> %vbsl3.i
}
define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK: g3:
; CHECK: vbsl
%vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
ret <4 x i32> %vbsl3.i
}
declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone