llvm-mirror/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll
David Green b0d54a04ab [ARM] Change VDUP type to i32 for MVE
The MVE VDUP instruction takes a GPR and splats it into every lane of a
vector register. Unlike NEON, we do not have a VDUPLANE equivalent
instruction that does the same splat from an FP register. Previously a
VDUP to a v4f32/v8f16 would be represented as (v4f32 VDUP f32), which
meant the instruction pattern needed to add a COPY_TO_REGCLASS to the
GPR.
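
As a rough sketch of what this looks like at the IR level (the function
below is illustrative only and not part of this test), the scalar splat
that selects to an MVE VDUP is the usual insertelement/shufflevector
idiom:

  define arm_aapcs_vfpcc <4 x float> @splat_f32(float %b) {
  entry:
    ; This splat becomes a single VDUP of the scalar into all four lanes.
    %ins = insertelement <4 x float> undef, float %b, i32 0
    %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer
    ret <4 x float> %splat
  }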

Instead, this now performs that conversion earlier, during an ISel DAG
combine, turning (VDUP x) into (VDUP (bitcast x)). This allows
instruction selection to tell that the input needs to be an i32, which in
one of the testcases lets it use an ldr (or, specifically, an ldm)
instead of a vldr;vmov pair.
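
The case where this matters is when the splatted scalar comes from
memory. A minimal sketch (hypothetical function, not taken from this
test):

  define arm_aapcs_vfpcc <4 x float> @mul_by_loaded_scalar(<4 x float> %a, float* %p) {
  entry:
    ; With the VDUP input treated as an i32, the float can be loaded with an
    ; integer ldr straight into a GPR, rather than a vldr into an s-register
    ; followed by a vmov back to the core register.
    %b = load float, float* %p, align 4
    %ins = insertelement <4 x float> undef, float %b, i32 0
    %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer
    %mul = fmul <4 x float> %splat, %a
    ret <4 x float> %mul
  }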

Whilst this is simple enough for floats, as the type sizes are the same,
there is no BITCAST equivalent for getting a half into an i32. That case
uses a VMOVrh ARMISD node, which doesn't know the same tricks yet.
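
For reference, the f16 flavour of the same splat (again a hypothetical
function; the test below instead receives the half coerced through a
float argument):

  define arm_aapcs_vfpcc <8 x half> @splat_f16(half %b) {
  entry:
    ; There is no half-to-i32 bitcast, so getting %b into a core register
    ; for the VDUP goes through a VMOVrh node instead.
    %ins = insertelement <8 x half> undef, half %b, i32 0
    %splat = shufflevector <8 x half> %ins, <8 x half> undef, <8 x i32> zeroinitializer
    ret <8 x half> %splat
  }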

Differential Revision: https://reviews.llvm.org/D76292
2020-03-20 09:48:45 +00:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i8 q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = mul <16 x i8> %b, %a
ret <16 x i8> %0
}
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i16 q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = mul <8 x i16> %b, %a
ret <8 x i16> %0
}
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = mul <4 x i32> %b, %a
ret <4 x i32> %0
}
define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = fmul <4 x float> %a, %b
ret <4 x float> %0
}
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i8 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i32 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.f16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
ret <8 x half> %2
}
declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <4 x float> @test_vmulq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.f32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
ret <4 x float> %2
}
declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) {
; CHECK-LABEL: test_vmulq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i8 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
%.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
%0 = mul <16 x i8> %.splat, %a
ret <16 x i8> %0
}
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) {
; CHECK-LABEL: test_vmulq_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i16 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = mul <8 x i16> %.splat, %a
ret <8 x i16> %0
}
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) {
; CHECK-LABEL: test_vmulq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%0 = mul <4 x i32> %.splat, %a
ret <4 x i32> %0
}
define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
; CHECK-LABEL: test_vmulq_n_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmul.f32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <4 x float> undef, float %b, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
%0 = fmul <4 x float> %.splat, %a
ret <4 x float> %0
}
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i8 q0, q1, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
%.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
%0 = zext i16 %p to i32
%1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i16 q0, q1, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = zext i16 %p to i32
%1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i32 q0, q1, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%0 = zext i16 %p to i32
%1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f16 r1, s8
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.f16 q0, q1, r1
; CHECK-NEXT: bx lr
entry:
%0 = bitcast float %b.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%.splatinsert = insertelement <8 x half> undef, half %1, i32 0
%.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
%2 = zext i16 %p to i32
%3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
%4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive)
ret <8 x half> %4
}
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i8 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
%.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
%0 = zext i16 %p to i32
%1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i16 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = zext i16 %p to i32
%1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%0 = zext i16 %p to i32
%1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.f32 q0, q0, r1
; CHECK-NEXT: bx lr
entry:
%.splatinsert = insertelement <4 x float> undef, float %b, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
%0 = zext i16 %p to i32
%1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef)
ret <4 x float> %2
}