llvm-mirror/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
Sanjay Patel (commit 2c86671523): [Intrinsics] define semantics for experimental fmax/fmin vector reductions
As discussed on llvm-dev:
http://lists.llvm.org/pipermail/llvm-dev/2020-April/140729.html

This is hopefully the final remaining showstopper before we can remove
the 'experimental' from the reduction intrinsics.

No behavior was specified for the FP min/max reductions, so we have a
mess of different interpretations.

There are a few potential options for the semantics of these max/min ops.
I think this is the simplest based on current behavior/implementation:
make the reductions inherit from the existing llvm.maxnum/minnum intrinsics.
These correspond to libm fmax/fmin, and those are similar to the (now
deprecated?) IEEE-754 maxNum/minNum functions (NaNs are treated as missing
data). So the default expansion creates calls to libm functions.
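
As a rough, hand-written sketch (illustrative IR, not taken from this test; %x0..%x3 stand for the extracted lanes of a <4 x float> %x), those semantics let a reduction such as

  %r = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)

be lowered to a tree of minnum operations:

  %m0 = call float @llvm.minnum.f32(float %x0, float %x1)
  %m1 = call float @llvm.minnum.f32(float %x2, float %x3)
  %r1 = call float @llvm.minnum.f32(float %m0, float %m1)

which is the shape of the vminnm.f32/vmaxnm.f32 chains in the CHECK lines below, since the Arm vminnm/vmaxnm instructions follow the IEEE-754 minNum/maxNum rules.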

Another option would be to inherit from llvm.maximum/minimum (NaNs propagate),
but most targets just crash in codegen when given those nodes because no
default expansion was ever implemented AFAICT.

We could also just assume 'nnan' semantics by default (we are already
assuming 'nsz' semantics in the maxnum/minnum intrinsics), but some targets
(AArch64, PowerPC) support the more fully defined behavior, so it makes little
sense to disallow a tighter spec. Fast-math flags (nnan) can be used to
loosen the semantics.
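
For concreteness, the two flavors exercised by this test look like the following (hypothetical calls mirroring the ones below):

  %a = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)  ; 'fast' implies nnan/nsz, so any tree of min ops is acceptable
  %b = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)       ; no FMF: must keep minnum (libm fmin) NaN behavior

The *_nofast and *_acc_nofast functions in this file use the second form.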

(Note that D67507 was proposed to update the LangRef to acknowledge the more
recent IEEE-754 2019 standard, but that patch seems to have stalled. If we do
update based on the new standard, the reduction instructions can seamlessly
inherit from whatever updates are made to the max/min intrinsics.)

x86 sees a regression here on 'nnan' tests because we have underlying,
longstanding bugs in FMF creation/propagation. Those need to be fixed apart
from this change (for example: https://llvm.org/PR35538). The expansion
sequence before this patch may not have been correct.

Differential Revision: https://reviews.llvm.org/D87391
Committed 2020-09-12 09:10:28 -04:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
define arm_aapcs_vfpcc float @fmin_v2f32(<2 x float> %x) {
; CHECK-LABEL: fmin_v2f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f32 s0, s0, s1
; CHECK-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) {
; CHECK-FP-LABEL: fmin_v4f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1
; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2
; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
ret float %z
}
; FIXME fminnum (vector) -> fminnum (scalar) ?
define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) {
; CHECK-FP-LABEL: fmin_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vcmp.f32 s5, s1
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s4, s0
; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s5
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s6, s2
; CHECK-NOFP-NEXT: vselgt.f32 s10, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s7, s3
; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vminnm.f32 s2, s10, s8
; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s12
; CHECK-NOFP-NEXT: vminnm.f32 s0, s2, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) {
; CHECK-FP-LABEL: fmin_v4f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) {
; CHECK-FP-LABEL: fmin_v8f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
; CHECK-FP-LABEL: fmin_v16f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v16f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vcmp.f16 s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s4, s0
; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s5, s1
; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vcmp.f16 s10, s12
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s6, s2
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vcmp.f16 s10, s12
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s7, s3
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc double @fmin_v1f64(<1 x double> %x) {
; CHECK-LABEL: fmin_v1f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmin_v2f64(<2 x double> %x) {
; CHECK-LABEL: fmin_v2f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f64 d0, d0, d1
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmin_v4f64(<4 x double> %x) {
; CHECK-LABEL: fmin_v4f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.f64 d3, d1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f64 d2, d0
; CHECK-NEXT: vselgt.f64 d4, d1, d3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d0, d2
; CHECK-NEXT: vminnm.f64 d0, d0, d4
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc float @fmin_v2f32_nofast(<2 x float> %x) {
; CHECK-LABEL: fmin_v2f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f32 s0, s0, s1
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) {
; CHECK-FP-LABEL: fmin_v4f32_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f32_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1
; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2
; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmin_v8f32_nofast(<8 x float> %x) {
; CHECK-FP-LABEL: fmin_v8f32_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f32_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vminnm.f32 s10, s0, s4
; CHECK-NOFP-NEXT: vminnm.f32 s8, s1, s5
; CHECK-NOFP-NEXT: vminnm.f32 s8, s10, s8
; CHECK-NOFP-NEXT: vminnm.f32 s10, s2, s6
; CHECK-NOFP-NEXT: vminnm.f32 s8, s8, s10
; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) {
; CHECK-FP-LABEL: fmin_v4f16_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f16_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) {
; CHECK-FP-LABEL: fmin_v8f16_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f16_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) {
; CHECK-FP-LABEL: fmin_v16f16_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v16f16_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc double @fmin_v1f64_nofast(<1 x double> %x) {
; CHECK-LABEL: fmin_v1f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmin_v2f64_nofast(<2 x double> %x) {
; CHECK-LABEL: fmin_v2f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f64 d0, d0, d1
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) {
; CHECK-LABEL: fmin_v4f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f64 d4, d1, d3
; CHECK-NEXT: vminnm.f64 d0, d0, d2
; CHECK-NEXT: vminnm.f64 d0, d0, d4
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc float @fmin_v2f32_acc(<2 x float> %x, float %y) {
; CHECK-LABEL: fmin_v2f32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f32 s0, s0, s1
; CHECK-NEXT: vminnm.f32 s0, s4, s0
; CHECK-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
%c = fcmp fast olt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fmin_v4f32_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6
; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f32_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1
; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2
; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3
; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
%c = fcmp fast olt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fmin_v8f32_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: vminnm.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f32_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vcmp.f32 s5, s1
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s4, s0
; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s6, s2
; CHECK-NOFP-NEXT: vselgt.f32 s12, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s7, s3
; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vminnm.f32 s2, s12, s10
; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s14
; CHECK-NOFP-NEXT: vminnm.f32 s0, s2, s0
; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
%c = fcmp fast olt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc void @fmin_v4f16_acc(<4 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmin_v4f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f16_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
%c = fcmp fast olt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmin_v2f16_acc(<2 x half> %x, half* %yy) {
; CHECK-LABEL: fmin_v2f16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vminnm.f16 s0, s0, s4
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: vminnm.f16 s0, s2, s0
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x)
%c = fcmp fast olt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmin_v8f16_acc(<8 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmin_v8f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f16_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
%c = fcmp fast olt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmin_v16f16_acc(<16 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmin_v16f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v16f16_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vcmp.f16 s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s4, s0
; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s5, s1
; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vcmp.f16 s10, s12
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s6, s2
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vcmp.f16 s10, s12
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s7, s3
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
%c = fcmp fast olt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc double @fmin_v1f64_acc(<1 x double> %x, double %y) {
; CHECK-LABEL: fmin_v1f64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f64 d0, d1, d0
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
%c = fcmp fast olt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc double @fmin_v2f64_acc(<2 x double> %x, double %y) {
; CHECK-LABEL: fmin_v2f64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f64 d0, d0, d1
; CHECK-NEXT: vminnm.f64 d0, d2, d0
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
%c = fcmp fast olt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc double @fmin_v4f64_acc(<4 x double> %x, double %y) {
; CHECK-LABEL: fmin_v4f64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.f64 d3, d1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f64 d2, d0
; CHECK-NEXT: vselgt.f64 d5, d1, d3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d0, d2
; CHECK-NEXT: vminnm.f64 d0, d0, d5
; CHECK-NEXT: vminnm.f64 d0, d4, d0
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
%c = fcmp fast olt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc float @fmin_v2f32_acc_nofast(<2 x float> %x, float %y) {
; CHECK-LABEL: fmin_v2f32_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f32 s0, s0, s1
; CHECK-NEXT: vcmp.f32 s0, s4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f32 s0, s4, s0
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
%c = fcmp olt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fmin_v4f32_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6
; CHECK-FP-NEXT: vcmp.f32 s0, s4
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f32_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1
; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2
; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3
; CHECK-NOFP-NEXT: vcmp.f32 s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
%c = fcmp olt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fmin_v8f32_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f32 s0, s8
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f32_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vminnm.f32 s12, s0, s4
; CHECK-NOFP-NEXT: vminnm.f32 s10, s1, s5
; CHECK-NOFP-NEXT: vminnm.f32 s10, s12, s10
; CHECK-NOFP-NEXT: vminnm.f32 s12, s2, s6
; CHECK-NOFP-NEXT: vminnm.f32 s10, s10, s12
; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vminnm.f32 s0, s10, s0
; CHECK-NOFP-NEXT: vcmp.f32 s0, s8
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
%c = fcmp olt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc void @fmin_v4f16_acc_nofast(<4 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmin_v4f16_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f16 s0, s2
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v4f16_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vcmp.f16 s0, s2
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
%c = fcmp olt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmin_v8f16_acc_nofast(<8 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmin_v8f16_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f16 s0, s2
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f16_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vcmp.f16 s0, s2
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
%c = fcmp olt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmin_v16f16_acc_nofast(<16 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmin_v16f16_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f16 s0, s2
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v16f16_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: vcmp.f16 s0, s2
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
%c = fcmp olt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc double @fmin_v1f64_acc_nofast(<1 x double> %x, double %y) {
; CHECK-LABEL: fmin_v1f64_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.f64 d0, d1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d1, d0
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
%c = fcmp olt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc double @fmin_v2f64_acc_nofast(<2 x double> %x, double %y) {
; CHECK-LABEL: fmin_v2f64_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f64 d0, d0, d1
; CHECK-NEXT: vcmp.f64 d0, d2
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d2, d0
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
%c = fcmp olt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y) {
; CHECK-LABEL: fmin_v4f64_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vminnm.f64 d5, d1, d3
; CHECK-NEXT: vminnm.f64 d0, d0, d2
; CHECK-NEXT: vminnm.f64 d0, d0, d5
; CHECK-NEXT: vcmp.f64 d0, d4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d4, d0
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
%c = fcmp olt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc float @fmax_v2f32(<2 x float> %x) {
; CHECK-LABEL: fmax_v2f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) {
; CHECK-FP-LABEL: fmax_v4f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmax_v8f32(<8 x float> %x) {
; CHECK-FP-LABEL: fmax_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vcmp.f32 s1, s5
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s0, s4
; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s5
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s2, s6
; CHECK-NOFP-NEXT: vselgt.f32 s10, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s3, s7
; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s10, s8
; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s12
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) {
; CHECK-FP-LABEL: fmax_v4f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) {
; CHECK-FP-LABEL: fmax_v8f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) {
; CHECK-FP-LABEL: fmax_v16f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v16f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vcmp.f16 s10, s8
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s0, s4
; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s1, s5
; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vcmp.f16 s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s2, s6
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vcmp.f16 s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s3, s7
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc double @fmax_v1f64(<1 x double> %x) {
; CHECK-LABEL: fmax_v1f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmax_v2f64(<2 x double> %x) {
; CHECK-LABEL: fmax_v2f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f64 d0, d0, d1
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmax_v4f64(<4 x double> %x) {
; CHECK-LABEL: fmax_v4f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.f64 d1, d3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f64 d0, d2
; CHECK-NEXT: vselgt.f64 d4, d1, d3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d0, d2
; CHECK-NEXT: vmaxnm.f64 d0, d0, d4
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc float @fmax_v2f32_nofast(<2 x float> %x) {
; CHECK-LABEL: fmax_v2f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) {
; CHECK-FP-LABEL: fmax_v4f32_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f32_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmax_v8f32_nofast(<8 x float> %x) {
; CHECK-FP-LABEL: fmax_v8f32_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f32_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s1, s5
; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s10, s8
; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s2, s6
; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s8, s10
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) {
; CHECK-FP-LABEL: fmax_v4f16_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f16_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) {
; CHECK-FP-LABEL: fmax_v8f16_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f16_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) {
; CHECK-FP-LABEL: fmax_v16f16_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v16f16_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc double @fmax_v1f64_nofast(<1 x double> %x) {
; CHECK-LABEL: fmax_v1f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmax_v2f64_nofast(<2 x double> %x) {
; CHECK-LABEL: fmax_v2f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f64 d0, d0, d1
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) {
; CHECK-LABEL: fmax_v4f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f64 d4, d1, d3
; CHECK-NEXT: vmaxnm.f64 d0, d0, d2
; CHECK-NEXT: vmaxnm.f64 d0, d0, d4
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
ret double %z
}
define arm_aapcs_vfpcc float @fmax_v2f32_acc(<2 x float> %x, float %y) {
; CHECK-LABEL: fmax_v2f32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-NEXT: vmaxnm.f32 s0, s4, s0
; CHECK-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
%c = fcmp fast ogt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fmax_v4f32_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6
; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f32_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
%c = fcmp fast ogt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmax_v8f32_acc(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fmax_v8f32_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: vmaxnm.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f32_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vcmp.f32 s1, s5
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s0, s4
; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s2, s6
; CHECK-NOFP-NEXT: vselgt.f32 s12, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f32 s3, s7
; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s12, s10
; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s14
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
%c = fcmp fast ogt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc void @fmax_v2f16_acc(<2 x half> %x, half* %yy) {
; CHECK-LABEL: fmax_v2f16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x)
%c = fcmp fast ogt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmax_v4f16_acc(<4 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmax_v4f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f16_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
%c = fcmp fast ogt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmax_v8f16_acc(<8 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmax_v8f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f16_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
%c = fcmp fast ogt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmax_v16f16_acc(<16 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmax_v16f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v16f16_acc:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vcmp.f16 s10, s8
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s0, s4
; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s1, s5
; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vcmp.f16 s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s2, s6
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vcmp.f16 s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s3, s7
; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vcmp.f16 s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
%c = fcmp fast ogt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}
define arm_aapcs_vfpcc double @fmax_v1f64_acc(<1 x double> %x, double %y) {
; CHECK-LABEL: fmax_v1f64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f64 d0, d1, d0
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
%c = fcmp fast ogt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc double @fmax_v2f64_acc(<2 x double> %x, double %y) {
; CHECK-LABEL: fmax_v2f64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f64 d0, d0, d1
; CHECK-NEXT: vmaxnm.f64 d0, d2, d0
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
%c = fcmp fast ogt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc double @fmax_v4f64_acc(<4 x double> %x, double %y) {
; CHECK-LABEL: fmax_v4f64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.f64 d1, d3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f64 d0, d2
; CHECK-NEXT: vselgt.f64 d5, d1, d3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d0, d2
; CHECK-NEXT: vmaxnm.f64 d0, d0, d5
; CHECK-NEXT: vmaxnm.f64 d0, d4, d0
; CHECK-NEXT: bx lr
entry:
%z = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
%c = fcmp fast ogt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}
define arm_aapcs_vfpcc float @fmax_v2f32_acc_nofast(<2 x float> %x, float %y) {
; CHECK-LABEL: fmax_v2f32_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-NEXT: vcmp.f32 s4, s0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f32 s0, s4, s0
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
%c = fcmp ogt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fmax_v4f32_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6
; CHECK-FP-NEXT: vcmp.f32 s4, s0
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f32_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3
; CHECK-NOFP-NEXT: vcmp.f32 s4, s0
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
%c = fcmp ogt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}
define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fmax_v8f32_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f32 s8, s0
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f32_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s1, s5
; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s12, s10
; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s2, s6
; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s10, s12
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s10, s0
; CHECK-NOFP-NEXT: vcmp.f32 s8, s0
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
%z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
%c = fcmp ogt float %y, %z
%r = select i1 %c, float %y, float %z
ret float %r
}

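; f16 accumulating nofast tests: the CHECK-FP path reduces pairwise
; (vrev32.16/vmaxnm.f16, or vmovx.f16 for v4f16), while CHECK-NOFP scalarizes
; each lane with vmovx.f16; both finish with a vcmp/vselgt against the
; accumulator loaded through the half pointer.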
define arm_aapcs_vfpcc void @fmax_v4f16_acc_nofast(<4 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmax_v4f16_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s4, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6
; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f16 s2, s0
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vcmp.f16 s2, s0
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
%c = fcmp ogt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}

define arm_aapcs_vfpcc void @fmax_v8f16_acc_nofast(<8 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmax_v8f16_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f16 s2, s0
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f16_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
; CHECK-NOFP-NEXT: vcmp.f16 s2, s0
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
%c = fcmp ogt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}

define arm_aapcs_vfpcc void @fmax_v16f16_acc_nofast(<16 x half> %x, half* %yy) {
; CHECK-FP-LABEL: fmax_v16f16_acc_nofast:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vcmp.f16 s2, s0
; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v16f16_acc_nofast:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s8, s4
; CHECK-NOFP-NEXT: vmovx.f16 s10, s0
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s5
; CHECK-NOFP-NEXT: vmovx.f16 s12, s1
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s12, s2
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmovx.f16 s10, s6
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7
; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0
; CHECK-NOFP-NEXT: vcmp.f16 s2, s0
; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0
; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
%c = fcmp ogt half %y, %z
%r = select i1 %c, half %y, half %z
store half %r, half* %yy
ret void
}

define arm_aapcs_vfpcc double @fmax_v1f64_acc_nofast(<1 x double> %x, double %y) {
; CHECK-LABEL: fmax_v1f64_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.f64 d1, d0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d1, d0
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
%c = fcmp ogt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}

define arm_aapcs_vfpcc double @fmax_v2f64_acc_nofast(<2 x double> %x, double %y) {
; CHECK-LABEL: fmax_v2f64_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f64 d0, d0, d1
; CHECK-NEXT: vcmp.f64 d2, d0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d2, d0
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
%c = fcmp ogt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}

define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y) {
; CHECK-LABEL: fmax_v4f64_acc_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmaxnm.f64 d5, d1, d3
; CHECK-NEXT: vmaxnm.f64 d0, d0, d2
; CHECK-NEXT: vmaxnm.f64 d0, d0, d5
; CHECK-NEXT: vcmp.f64 d4, d0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselgt.f64 d0, d4, d0
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
%c = fcmp ogt double %y, %z
%r = select i1 %c, double %y, double %z
ret double %r
}

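; Intrinsic declarations used by the reduction tests in this file.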
declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>)
declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>)
declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>)
declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>)
declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>)
declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>)
declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)
declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>)
declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>)