1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00
David Green a9b961b8da [ARM] CSEL generation
This adds a peephole optimisation to turn a t2MOVccr that could not be
folded into any other instruction into a CSEL on 8.1-m. The t2MOVccr
would usually be expanded into a conditional mov, that becomes an IT;
MOV pair. We can instead generate a CSEL instruction, which can
potentially be smaller and allows better register allocation freedom,
which can help reduce codesize. Performance is more variable and may
depend on the micrarchitecture details, but initial results look good.
If we need to control this per-cpu, we can add a subtarget feature as we
need it.

Original patch by David Penry.

Differential Revision: https://reviews.llvm.org/D83566
2020-07-16 11:10:53 +01:00

76 lines
2.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
define arm_aapcs_vfpcc <16 x i8> @abs_v16i8(<16 x i8> %s1) {
; CHECK-LABEL: abs_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vabs.s8 q0, q0
; CHECK-NEXT: bx lr
entry:
%0 = icmp slt <16 x i8> %s1, zeroinitializer
%1 = sub nsw <16 x i8> zeroinitializer, %s1
%2 = select <16 x i1> %0, <16 x i8> %1, <16 x i8> %s1
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @abs_v8i16(<8 x i16> %s1) {
; CHECK-LABEL: abs_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vabs.s16 q0, q0
; CHECK-NEXT: bx lr
entry:
%0 = icmp slt <8 x i16> %s1, zeroinitializer
%1 = sub nsw <8 x i16> zeroinitializer, %s1
%2 = select <8 x i1> %0, <8 x i16> %1, <8 x i16> %s1
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @abs_v4i32(<4 x i32> %s1) {
; CHECK-LABEL: abs_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vabs.s32 q0, q0
; CHECK-NEXT: bx lr
entry:
%0 = icmp slt <4 x i32> %s1, zeroinitializer
%1 = sub nsw <4 x i32> zeroinitializer, %s1
%2 = select <4 x i1> %0, <4 x i32> %1, <4 x i32> %s1
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) {
; CHECK-LABEL: abs_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: rsbs.w lr, r1, #0
; CHECK-NEXT: sbc.w r2, r12, r0
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r3, mi
; CHECK-NEXT: ands r3, r3, #1
; CHECK-NEXT: csel r1, lr, r1, ne
; CHECK-NEXT: csel r0, r2, r0, ne
; CHECK-NEXT: vmov.32 q1[0], r1
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: rsbs r2, r1, #0
; CHECK-NEXT: sbc.w r12, r12, r0
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r3, mi
; CHECK-NEXT: ands r3, r3, #1
; CHECK-NEXT: csel r1, r2, r1, ne
; CHECK-NEXT: csel r0, r12, r0, ne
; CHECK-NEXT: vmov.32 q1[2], r1
; CHECK-NEXT: vmov.32 q1[3], r0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: pop {r7, pc}
entry:
%0 = icmp slt <2 x i64> %s1, zeroinitializer
%1 = sub nsw <2 x i64> zeroinitializer, %s1
%2 = select <2 x i1> %0, <2 x i64> %1, <2 x i64> %s1
ret <2 x i64> %2
}