mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
a9b961b8da
This adds a peephole optimisation to turn a t2MOVccr that could not be folded into any other instruction into a CSEL on 8.1-m. The t2MOVccr would usually be expanded into a conditional mov, that becomes an IT; MOV pair. We can instead generate a CSEL instruction, which can potentially be smaller and allows better register allocation freedom, which can help reduce codesize. Performance is more variable and may depend on the microarchitecture details, but initial results look good. If we need to control this per-cpu, we can add a subtarget feature as we need it. Original patch by David Penry. Differential Revision: https://reviews.llvm.org/D83566
76 lines
2.4 KiB
LLVM
76 lines
2.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Absolute value of a <16 x i8> vector, written as the compare / negate /
; select idiom: result[i] = (s1[i] < 0) ? (0 - s1[i]) : s1[i].
; The assertions in the body expect the whole idiom to be emitted as a single
; vabs.s8 on q0 followed by the return.
define arm_aapcs_vfpcc <16 x i8> @abs_v16i8(<16 x i8> %s1) {
; CHECK-LABEL: abs_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vabs.s8 q0, q0
; CHECK-NEXT: bx lr
entry:
  %0 = icmp slt <16 x i8> %s1, zeroinitializer            ; per-lane mask: s1 < 0 (signed)
  %1 = sub nsw <16 x i8> zeroinitializer, %s1             ; per-lane negation, 0 - s1
  %2 = select <16 x i1> %0, <16 x i8> %1, <16 x i8> %s1   ; negated lane where s1 was negative
  ret <16 x i8> %2
}

; Absolute value of a <8 x i16> vector, written as the compare / negate /
; select idiom: result[i] = (s1[i] < 0) ? (0 - s1[i]) : s1[i].
; The assertions in the body expect the whole idiom to be emitted as a single
; vabs.s16 on q0 followed by the return.
define arm_aapcs_vfpcc <8 x i16> @abs_v8i16(<8 x i16> %s1) {
; CHECK-LABEL: abs_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vabs.s16 q0, q0
; CHECK-NEXT: bx lr
entry:
  %0 = icmp slt <8 x i16> %s1, zeroinitializer            ; per-lane mask: s1 < 0 (signed)
  %1 = sub nsw <8 x i16> zeroinitializer, %s1             ; per-lane negation, 0 - s1
  %2 = select <8 x i1> %0, <8 x i16> %1, <8 x i16> %s1    ; negated lane where s1 was negative
  ret <8 x i16> %2
}

; Absolute value of a <4 x i32> vector, written as the compare / negate /
; select idiom: result[i] = (s1[i] < 0) ? (0 - s1[i]) : s1[i].
; The assertions in the body expect the whole idiom to be emitted as a single
; vabs.s32 on q0 followed by the return.
define arm_aapcs_vfpcc <4 x i32> @abs_v4i32(<4 x i32> %s1) {
; CHECK-LABEL: abs_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vabs.s32 q0, q0
; CHECK-NEXT: bx lr
entry:
  %0 = icmp slt <4 x i32> %s1, zeroinitializer            ; per-lane mask: s1 < 0 (signed)
  %1 = sub nsw <4 x i32> zeroinitializer, %s1             ; per-lane negation, 0 - s1
  %2 = select <4 x i1> %0, <4 x i32> %1, <4 x i32> %s1    ; negated lane where s1 was negative
  ret <4 x i32> %2
}

; Absolute value of a <2 x i64> vector, written as the same compare / negate /
; select idiom as the narrower tests in this file:
; result[i] = (s1[i] < 0) ? (0 - s1[i]) : s1[i].
; Unlike the 8/16/32-bit cases, the assertions here do not expect a vabs.
; They expect each 64-bit lane to be handled in core registers:
;   - the low and high words are moved out of q0 with vmov,
;   - rsbs / sbc form the 64-bit value 0 - lane,
;   - cmp on the high word plus cset/ands produce the "lane is negative" flag,
;   - two csel instructions pick the negated or original word pair,
;   - vmov.32 writes the chosen words into q1.
; q1 is finally copied to q0, and lr is saved/restored around the sequence
; because it is used as a scratch register.
define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) {
; CHECK-LABEL: abs_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: rsbs.w lr, r1, #0
; CHECK-NEXT: sbc.w r2, r12, r0
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r3, mi
; CHECK-NEXT: ands r3, r3, #1
; CHECK-NEXT: csel r1, lr, r1, ne
; CHECK-NEXT: csel r0, r2, r0, ne
; CHECK-NEXT: vmov.32 q1[0], r1
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: rsbs r2, r1, #0
; CHECK-NEXT: sbc.w r12, r12, r0
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r3, mi
; CHECK-NEXT: ands r3, r3, #1
; CHECK-NEXT: csel r1, r2, r1, ne
; CHECK-NEXT: csel r0, r12, r0, ne
; CHECK-NEXT: vmov.32 q1[2], r1
; CHECK-NEXT: vmov.32 q1[3], r0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: pop {r7, pc}
entry:
  %0 = icmp slt <2 x i64> %s1, zeroinitializer            ; per-lane mask: s1 < 0 (signed)
  %1 = sub nsw <2 x i64> zeroinitializer, %s1             ; per-lane negation, 0 - s1
  %2 = select <2 x i1> %0, <2 x i64> %1, <2 x i64> %s1    ; negated lane where s1 was negative
  ret <2 x i64> %2
}