1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 12:12:47 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-extractstore.ll
David Green 0c8d4b9ca1 [ARM] Optimize fp store of extract to integer store if already available.
Given a floating point store from an extracted vector, with an integer
VGETLANE that already exists, storing the existing VGETLANEu directly
can be better for performance. As the value is known to already be in an
integer register, this can help reduce fp register pressure, removes
the need for the fp extract and allows use of more integer post-inc
stores not available with vstr.

This can be a bit narrow in scope, but helps with certain biquad kernels
that store shuffled vector elements.

Differential Revision: https://reviews.llvm.org/D96159
2021-02-12 18:34:58 +00:00

190 lines
6.1 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; Adds two <8 x half> vectors, extracts lane 1 of the sum, stores that half
; through %p and also returns it as a scalar. The function carries no
; arm_aapcs_vfpcc attribute: the expected code assembles d0 from r0/r1 and
; loads the other addend and the pointer from the stack. Lane 1 is expected to
; be brought down with vmovx.f16, stored with an FP-register vstr.16, and then
; copied to r0 for the return value.
define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, half* nocapture %p) {
; CHECK-LABEL: extret1_f16_sf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: ldr r0, [sp, #16]
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bx lr
%c = fadd <8 x half> %a, %b
%e = extractelement <8 x half> %c, i32 1
store half %e, half* %p, align 2
ret half %e
}
; Same shape as the lane-1 variant but extracts lane 4 of the <8 x half> sum,
; storing it through %p and returning it as a scalar half. No arm_aapcs_vfpcc
; attribute: the expected code assembles d1 from r2/r3 and loads the other
; addend and the pointer from the stack. The expected output stores straight
; from s2 with vstr.16 (no vmovx is expected for this lane) and copies s2 to r0
; for the return value.
define half @extret4_f16_sf(<8 x half> %a, <8 x half> %b, half* nocapture %p) {
; CHECK-LABEL: extret4_f16_sf:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: ldr r0, [sp, #16]
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vstr.16 s2, [r0]
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: bx lr
%c = fadd <8 x half> %a, %b
%e = extractelement <8 x half> %c, i32 4
store half %e, half* %p, align 2
ret half %e
}
; arm_aapcs_vfpcc variant: adds two <8 x half> vectors, extracts lane 1, stores
; it through %p and returns it as a scalar half. The expected code adds q0 and
; q1 directly, moves lane 1 into the bottom of s0 with vmovx.f16 and stores it
; with vstr.16; no further move is expected before the return.
define arm_aapcs_vfpcc half @extret1_f16_hf(<8 x half> %a, <8 x half> %b, half* nocapture %p) {
; CHECK-LABEL: extret1_f16_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: bx lr
%c = fadd <8 x half> %a, %b
%e = extractelement <8 x half> %c, i32 1
store half %e, half* %p, align 2
ret half %e
}
; arm_aapcs_vfpcc variant extracting lane 4 of the <8 x half> sum, storing it
; through %p and returning it as a scalar half. The expected code copies s2
; into s0 for the return value and stores the half from s2 with vstr.16.
define arm_aapcs_vfpcc half @extret4_f16_hf(<8 x half> %a, <8 x half> %b, half* nocapture %p) {
; CHECK-LABEL: extret4_f16_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vstr.16 s2, [r0]
; CHECK-NEXT: bx lr
%c = fadd <8 x half> %a, %b
%e = extractelement <8 x half> %c, i32 4
store half %e, half* %p, align 2
ret half %e
}
; Extracts lane 1 of the <8 x half> sum, stores it through %p, and returns a
; vector with that element splatted to all eight lanes (insertelement into lane
; 0 followed by a zeroinitializer-mask shufflevector). The expected code moves
; the lane into r1 with vmov.u16 for the vdup.16 and reuses that same core
; register for the store, emitting an integer strh rather than an FP vstr.16.
define arm_aapcs_vfpcc <8 x half> @extret1_v8f16_hf(<8 x half> %a, <8 x half> %b, half* nocapture %p) {
; CHECK-LABEL: extret1_v8f16_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: vdup.16 q0, r1
; CHECK-NEXT: strh r1, [r0]
; CHECK-NEXT: bx lr
%c = fadd <8 x half> %a, %b
%e = extractelement <8 x half> %c, i32 1
store half %e, half* %p, align 2
%i = insertelement <8 x half> undef, half %e, i32 0
%s = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
ret <8 x half> %s
}
; Lane-4 counterpart of the f16 splat test: extracts lane 4 of the <8 x half>
; sum, stores it through %p, and returns it splatted across all eight lanes.
; The expected code moves the lane into r1 with vmov.u16, feeds r1 to vdup.16
; and stores the same register with an integer strh rather than vstr.16.
define arm_aapcs_vfpcc <8 x half> @extret4_v8f16_hf(<8 x half> %a, <8 x half> %b, half* nocapture %p) {
; CHECK-LABEL: extret4_v8f16_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: vdup.16 q0, r1
; CHECK-NEXT: strh r1, [r0]
; CHECK-NEXT: bx lr
%c = fadd <8 x half> %a, %b
%e = extractelement <8 x half> %c, i32 4
store half %e, half* %p, align 2
%i = insertelement <8 x half> undef, half %e, i32 0
%s = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
ret <8 x half> %s
}
; Adds two <4 x float> vectors, extracts lane 1 of the sum, stores it through
; %p and returns it as a scalar float. No arm_aapcs_vfpcc attribute: the
; expected code assembles d0 from r0/r1 and loads the other addend and the
; pointer from the stack. The expected output copies s1 to r0 for the return
; value and stores the float from s1 with vstr.
define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, float* nocapture %p) {
; CHECK-LABEL: extret1_f32_sf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: ldr r1, [sp, #16]
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: bx lr
%c = fadd <4 x float> %a, %b
%e = extractelement <4 x float> %c, i32 1
store float %e, float* %p, align 4
ret float %e
}
; Lane-2 counterpart of the f32 test without arm_aapcs_vfpcc: extracts lane 2
; of the <4 x float> sum, stores it through %p and returns it as a scalar. The
; expected code assembles d1 from r2/r3, loads the other addend and the pointer
; from the stack, copies s2 to r0 for the return value and stores s2 with vstr.
define float @extret2_f32_sf(<4 x float> %a, <4 x float> %b, float* nocapture %p) {
; CHECK-LABEL: extret2_f32_sf:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: ldr r1, [sp, #16]
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: bx lr
%c = fadd <4 x float> %a, %b
%e = extractelement <4 x float> %c, i32 2
store float %e, float* %p, align 4
ret float %e
}
; arm_aapcs_vfpcc variant: adds two <4 x float> vectors, extracts lane 1,
; stores it through %p and returns it as a scalar float. The expected code
; copies s1 into s0 for the return value and stores the float from s1 with
; vstr.
define arm_aapcs_vfpcc float @extret1_f32_hf(<4 x float> %a, <4 x float> %b, float* nocapture %p) {
; CHECK-LABEL: extret1_f32_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vstr s1, [r0]
; CHECK-NEXT: bx lr
%c = fadd <4 x float> %a, %b
%e = extractelement <4 x float> %c, i32 1
store float %e, float* %p, align 4
ret float %e
}
; arm_aapcs_vfpcc variant extracting lane 2 of the <4 x float> sum, storing it
; through %p and returning it as a scalar float. The expected code copies s2
; into s0 for the return value and stores the float from s2 with vstr.
define arm_aapcs_vfpcc float @extret2_f32_hf(<4 x float> %a, <4 x float> %b, float* nocapture %p) {
; CHECK-LABEL: extret2_f32_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: bx lr
%c = fadd <4 x float> %a, %b
%e = extractelement <4 x float> %c, i32 2
store float %e, float* %p, align 4
ret float %e
}
; Extracts lane 1 of the <4 x float> sum, stores it through %p, and returns a
; vector with that element splatted to all four lanes. The expected code keeps
; the sum in q1, copies s5 to r1 for the vdup.32, and still stores from the FP
; register with vstr s5; unlike the f16 splat tests in this file, no integer
; store of r1 is expected here.
define arm_aapcs_vfpcc <4 x float> @extret1_v4f32_hf(<4 x float> %a, <4 x float> %b, float* nocapture %p) {
; CHECK-LABEL: extret1_v4f32_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f32 q1, q0, q1
; CHECK-NEXT: vmov r1, s5
; CHECK-NEXT: vstr s5, [r0]
; CHECK-NEXT: vdup.32 q0, r1
; CHECK-NEXT: bx lr
%c = fadd <4 x float> %a, %b
%e = extractelement <4 x float> %c, i32 1
store float %e, float* %p, align 4
%i = insertelement <4 x float> undef, float %e, i32 0
%s = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %s
}
; Lane-2 counterpart of the f32 splat test: extracts lane 2 of the <4 x float>
; sum, stores it through %p, and returns it splatted across all four lanes. The
; expected code keeps the sum in q1, copies s6 to r1 for the vdup.32, and
; stores from the FP register with vstr s6 (no integer store of r1 is
; expected).
define arm_aapcs_vfpcc <4 x float> @extret2_v4f32_hf(<4 x float> %a, <4 x float> %b, float* nocapture %p) {
; CHECK-LABEL: extret2_v4f32_hf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f32 q1, q0, q1
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vstr s6, [r0]
; CHECK-NEXT: vdup.32 q0, r1
; CHECK-NEXT: bx lr
%c = fadd <4 x float> %a, %b
%e = extractelement <4 x float> %c, i32 2
store float %e, float* %p, align 4
%i = insertelement <4 x float> undef, float %e, i32 0
%s = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %s
}