mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-26 14:33:02 +02:00
135898ebe3
The expression based expansion too often results in IR level optimizations splitting the intermediate values into separate basic blocks, preventing the formation of the VBSL instruction as the code author intended. In particular, LICM would often hoist part of the computation out of a loop. rdar://11011471 llvm-svn: 164340
155 lines
5.3 KiB
LLVM
155 lines
5.3 KiB
LLVM
; Codegen test: this file is compiled with llc for the ARM target with the
; NEON feature enabled, and the emitted assembly is verified by FileCheck
; against the check directives embedded in the functions below.
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
|
|
|
|
; v_bsli8: bitwise select written with plain and/xor/or on <8 x i8>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: v_bsli8:
;CHECK: vbsl
  %mask = load <8 x i8>* %A
  %on_set = load <8 x i8>* %B
  %on_clear = load <8 x i8>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <8 x i8> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <8 x i8> %inv_mask, %on_clear
  %merged = or <8 x i8> %from_set, %from_clear
  ret <8 x i8> %merged
}
|
|
|
|
; v_bsli16: bitwise select written with plain and/xor/or on <4 x i16>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: v_bsli16:
;CHECK: vbsl
  %mask = load <4 x i16>* %A
  %on_set = load <4 x i16>* %B
  %on_clear = load <4 x i16>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <4 x i16> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <4 x i16> %inv_mask, %on_clear
  %merged = or <4 x i16> %from_set, %from_clear
  ret <4 x i16> %merged
}
|
|
|
|
; v_bsli32: bitwise select written with plain and/xor/or on <2 x i32>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: v_bsli32:
;CHECK: vbsl
  %mask = load <2 x i32>* %A
  %on_set = load <2 x i32>* %B
  %on_clear = load <2 x i32>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <2 x i32> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <2 x i32> %mask, <i32 -1, i32 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <2 x i32> %inv_mask, %on_clear
  %merged = or <2 x i32> %from_set, %from_clear
  ret <2 x i32> %merged
}
|
|
|
|
; v_bsli64: bitwise select written with plain and/xor/or on <1 x i64>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
;CHECK: v_bsli64:
;CHECK: vbsl
  %mask = load <1 x i64>* %A
  %on_set = load <1 x i64>* %B
  %on_clear = load <1 x i64>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <1 x i64> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <1 x i64> %mask, <i64 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <1 x i64> %inv_mask, %on_clear
  %merged = or <1 x i64> %from_set, %from_clear
  ret <1 x i64> %merged
}
|
|
|
|
; v_bslQi8: bitwise select written with plain and/xor/or on <16 x i8>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK: v_bslQi8:
;CHECK: vbsl
  %mask = load <16 x i8>* %A
  %on_set = load <16 x i8>* %B
  %on_clear = load <16 x i8>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <16 x i8> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <16 x i8> %inv_mask, %on_clear
  %merged = or <16 x i8> %from_set, %from_clear
  ret <16 x i8> %merged
}
|
|
|
|
; v_bslQi16: bitwise select written with plain and/xor/or on <8 x i16>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK: v_bslQi16:
;CHECK: vbsl
  %mask = load <8 x i16>* %A
  %on_set = load <8 x i16>* %B
  %on_clear = load <8 x i16>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <8 x i16> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <8 x i16> %inv_mask, %on_clear
  %merged = or <8 x i16> %from_set, %from_clear
  ret <8 x i16> %merged
}
|
|
|
|
; v_bslQi32: bitwise select written with plain and/xor/or on <4 x i32>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK: v_bslQi32:
;CHECK: vbsl
  %mask = load <4 x i32>* %A
  %on_set = load <4 x i32>* %B
  %on_clear = load <4 x i32>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <4 x i32> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <4 x i32> %inv_mask, %on_clear
  %merged = or <4 x i32> %from_set, %from_clear
  ret <4 x i32> %merged
}
|
|
|
|
; v_bslQi64: bitwise select written with plain and/xor/or on <2 x i64>.
; Loads three vectors from %A, %B and %C and returns (A & B) | (~A & C).
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
;CHECK: v_bslQi64:
;CHECK: vbsl
  %mask = load <2 x i64>* %A
  %on_set = load <2 x i64>* %B
  %on_clear = load <2 x i64>* %C
  ; Bits of %on_set kept where %mask has a 1.
  %from_set = and <2 x i64> %mask, %on_set
  ; xor with all-ones is a bitwise NOT of the mask.
  %inv_mask = xor <2 x i64> %mask, <i64 -1, i64 -1>
  ; Bits of %on_clear kept where %mask has a 0.
  %from_clear = and <2 x i64> %inv_mask, %on_clear
  %merged = or <2 x i64> %from_set, %from_clear
  ret <2 x i64> %merged
}
|
|
|
|
; f1: forwards its three <8 x i8> arguments, in order, to the
; llvm.arm.neon.vbsl.v8i8 intrinsic and returns the intrinsic's result.
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
; CHECK: f1:
; CHECK: vbsl
  %selected = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
  ret <8 x i8> %selected
}
|
|
|
|
; f2: forwards its three <4 x i16> arguments, in order, to the
; llvm.arm.neon.vbsl.v4i16 intrinsic and returns the intrinsic's result.
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK: f2:
; CHECK: vbsl
  %selected = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
  ret <4 x i16> %selected
}
|
|
|
|
; f3: forwards its three <2 x i32> arguments, in order, to the
; llvm.arm.neon.vbsl.v2i32 intrinsic and returns the intrinsic's result.
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK: f3:
; CHECK: vbsl
  %selected = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
  ret <2 x i32> %selected
}
|
|
|
|
; g1: forwards its three <16 x i8> arguments, in order, to the
; llvm.arm.neon.vbsl.v16i8 intrinsic and returns the intrinsic's result.
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
; CHECK: g1:
; CHECK: vbsl
  %selected = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
  ret <16 x i8> %selected
}
|
|
|
|
; g2: forwards its three <8 x i16> arguments, in order, to the
; llvm.arm.neon.vbsl.v8i16 intrinsic and returns the intrinsic's result.
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
; CHECK: g2:
; CHECK: vbsl
  %selected = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
  ret <8 x i16> %selected
}
|
|
|
|
; g3: forwards its three <4 x i32> arguments, in order, to the
; llvm.arm.neon.vbsl.v4i32 intrinsic and returns the intrinsic's result.
; The FileCheck directives require the text "vbsl" to appear in the llc
; output after this function's label.
define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK: g3:
; CHECK: vbsl
  %selected = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
  ret <4 x i32> %selected
}
|
|
|
|
; Declarations of the llvm.arm.neon.vbsl intrinsic overloads called by
; f1-f3 (64-bit vector types) and g1-g3 (128-bit vector types), listed in
; the same order as the functions that use them.
declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
|