1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 11:02:59 +02:00

[ARM] MVE big endian loads/stores

This adds some missing patterns for big endian loads/stores, allowing unaligned
loads/stores to also be selected with an extra VREV, which produces better code
than aligning through a stack. Also moves VLDR_P0 to not be LE only, and
adjusts some of the tests to show all that working.

Differential Revision: https://reviews.llvm.org/D65583

llvm-svn: 368304
This commit is contained in:
David Green 2019-08-08 15:15:19 +00:00
parent db4202796c
commit 3265a2671b
6 changed files with 396 additions and 222 deletions

View File

@ -14075,45 +14075,21 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
return true;
}
if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
Ty != MVT::v2f64)
return false;
if (Subtarget->isLittle()) {
// In little-endian MVE, the store instructions VSTRB.U8,
// VSTRH.U16 and VSTRW.U32 all store the vector register in
// exactly the same format, and differ only in the range of
// their immediate offset field and the required alignment.
//
// In particular, VSTRB.U8 can store a vector at byte alignment.
// So at this stage we can simply say that loads/stores of all
// 128-bit wide vector types are permitted at any alignment,
// because we know at least _one_ instruction can manage that.
//
// Later on we might find that some of those loads are better
// generated as VLDRW.U32 if alignment permits, to take
// advantage of the larger immediate range. But for the moment,
// all that matters is that if we don't lower the load then
// _some_ instruction can handle it.
// In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
// VSTRW.U32 all store the vector register in exactly the same format, and
// differ only in the range of their immediate offset field and the required
// alignment. So there is always a store that can be used, regardless of
// actual type.
//
// For big endian, that is not the case. But we can still emit a (VSTRB.U8;
// VREV64.8) pair and get the same effect. This will likely be better than
// aligning the vector through the stack.
if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
Ty == MVT::v2f64) {
if (Fast)
*Fast = true;
return true;
} else {
// In big-endian MVE, those instructions aren't so similar
// after all, because they reorder the bytes of the vector
// differently. So this time we can only store a particular
// kind of vector if its alignment is at least the element
// type. And we can't store vectors of i64 or f64 at all
// without having to do some postprocessing, because there's
// no VSTRD.U64.
if (Ty == MVT::v16i8 ||
((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
if (Fast)
*Fast = true;
return true;
}
}
return false;

View File

@ -4820,13 +4820,6 @@ let Predicates = [HasMVEInt, IsLE] in {
defm : MVE_unpred_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
defm : MVE_unpred_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
defm : MVE_unpred_vector_load<MVE_VLDRWU32, alignedload32, 2>;
def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
(v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
(v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
(v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
}
let Predicates = [HasMVEInt, IsBE] in {
@ -4841,6 +4834,41 @@ let Predicates = [HasMVEInt, IsBE] in {
def : MVE_unpred_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
def : MVE_unpred_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
def : MVE_unpred_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
// Other unaligned loads/stores need to go through a VREV
def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)),
(v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)),
(v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)),
(v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)),
(v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)),
(v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)),
(v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
}
let Predicates = [HasMVEInt] in {
def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
(v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
(v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
(v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
}

View File

@ -29,47 +29,14 @@ define void @load_load_add_store_align1(<4 x i32> *%src1, <4 x i32> *%src2) {
;
; CHECK-BE-LABEL: load_load_add_store_align1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r4, r6, r7, lr}
; CHECK-BE-NEXT: push {r4, r6, r7, lr}
; CHECK-BE-NEXT: .setfp r7, sp, #8
; CHECK-BE-NEXT: add r7, sp, #8
; CHECK-BE-NEXT: .pad #48
; CHECK-BE-NEXT: sub sp, #48
; CHECK-BE-NEXT: mov r4, sp
; CHECK-BE-NEXT: bfc r4, #0, #4
; CHECK-BE-NEXT: mov sp, r4
; CHECK-BE-NEXT: ldr.w r12, [r1]
; CHECK-BE-NEXT: ldr r3, [r1, #4]
; CHECK-BE-NEXT: ldr r2, [r1, #8]
; CHECK-BE-NEXT: ldr r1, [r1, #12]
; CHECK-BE-NEXT: strd r2, r1, [sp, #24]
; CHECK-BE-NEXT: mov r1, r0
; CHECK-BE-NEXT: strd r12, r3, [sp, #16]
; CHECK-BE-NEXT: ldr r2, [r1, #4]!
; CHECK-BE-NEXT: str r2, [sp, #4]
; CHECK-BE-NEXT: ldr r2, [r0]
; CHECK-BE-NEXT: str r2, [sp]
; CHECK-BE-NEXT: mov r2, r1
; CHECK-BE-NEXT: ldr r3, [r2, #4]!
; CHECK-BE-NEXT: str r3, [sp, #8]
; CHECK-BE-NEXT: ldr r3, [r2, #4]
; CHECK-BE-NEXT: str r3, [sp, #12]
; CHECK-BE-NEXT: add r3, sp, #16
; CHECK-BE-NEXT: vldrw.u32 q0, [r3]
; CHECK-BE-NEXT: mov r3, sp
; CHECK-BE-NEXT: vldrw.u32 q1, [r3]
; CHECK-BE-NEXT: add r3, sp, #32
; CHECK-BE-NEXT: vldrb.u8 q0, [r1]
; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vrev32.8 q1, q1
; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
; CHECK-BE-NEXT: vstrw.32 q0, [r3]
; CHECK-BE-NEXT: ldrd r3, r4, [sp, #40]
; CHECK-BE-NEXT: ldrd r12, lr, [sp, #32]
; CHECK-BE-NEXT: str r4, [r2, #4]
; CHECK-BE-NEXT: sub.w r4, r7, #8
; CHECK-BE-NEXT: str r3, [r2]
; CHECK-BE-NEXT: str.w lr, [r1]
; CHECK-BE-NEXT: str.w r12, [r0]
; CHECK-BE-NEXT: mov sp, r4
; CHECK-BE-NEXT: pop {r4, r6, r7, pc}
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vstrb.8 q0, [r0]
; CHECK-BE-NEXT: bx lr
entry:
%l1 = load <4 x i32>, <4 x i32>* %src1, align 1
%l2 = load <4 x i32>, <4 x i32>* %src2, align 1

View File

@ -1,72 +1,138 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
; CHECK-LABEL: load_4xi32_a4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: load_4xi32_a4:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %vp, align 4
ret <4 x i32> %0
%1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) {
; CHECK-LABEL: load_4xi32_a2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: load_4xi32_a2:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldrh.u16 q0, [r0]
; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_4xi32_a2:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vldrb.u8 q0, [r0]
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %vp, align 2
ret <4 x i32> %0
%1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) {
; CHECK-LABEL: load_4xi32_a1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: load_4xi32_a1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vldrb.u8 q0, [r0]
; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_4xi32_a1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vldrb.u8 q0, [r0]
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %vp, align 1
ret <4 x i32> %0
%1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %1
}
define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LABEL: store_4xi32_a4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: store_4xi32_a4:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
; CHECK-LE-NEXT: vstrw.32 q0, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: store_4xi32_a4:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
; CHECK-BE-NEXT: vstrw.32 q0, [r0]
; CHECK-BE-NEXT: bx lr
entry:
store <4 x i32> %val, <4 x i32>* %vp, align 4
%0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
store <4 x i32> %0, <4 x i32>* %vp, align 4
ret void
}
define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LABEL: store_4xi32_a2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vstrh.16 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: store_4xi32_a2:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
; CHECK-LE-NEXT: vstrh.16 q0, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: store_4xi32_a2:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vstrb.8 q0, [r0]
; CHECK-BE-NEXT: bx lr
entry:
store <4 x i32> %val, <4 x i32>* %vp, align 2
%0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
store <4 x i32> %0, <4 x i32>* %vp, align 2
ret void
}
define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LABEL: store_4xi32_a1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vstrb.8 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: store_4xi32_a1:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
; CHECK-LE-NEXT: vstrb.8 q0, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: store_4xi32_a1:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vstrb.8 q0, [r0]
; CHECK-BE-NEXT: bx lr
entry:
store <4 x i32> %val, <4 x i32>* %vp, align 1
%0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
store <4 x i32> %0, <4 x i32>* %vp, align 1
ret void
}
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) {
; CHECK-LABEL: load_4xi32_a4_offset_pos:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: add.w r0, r0, #508
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: load_4xi32_a4_offset_pos:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: add.w r0, r0, #508
; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4_offset_pos:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: add.w r0, r0, #508
; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
%ipoffset = getelementptr inbounds i32, i32* %ip, i32 127
%vp = bitcast i32* %ipoffset to <4 x i32>*
@ -75,11 +141,18 @@ entry:
}
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) {
; CHECK-LABEL: load_4xi32_a4_offset_neg:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: sub.w r0, r0, #508
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: load_4xi32_a4_offset_neg:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: sub.w r0, r0, #508
; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4_offset_neg:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: sub.w r0, r0, #508
; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
%ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127
%vp = bitcast i32* %ipoffset to <4 x i32>*
@ -88,19 +161,34 @@ entry:
}
define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() {
; CHECK-LABEL: loadstore_4xi32_stack_off16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #40
; CHECK-NEXT: sub sp, #40
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: movs r0, #3
; CHECK-NEXT: vstrw.32 q0, [sp, #16]
; CHECK-NEXT: str r0, [sp, #16]
; CHECK-NEXT: vldrw.u32 q0, [sp, #16]
; CHECK-NEXT: add sp, #40
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: loadstore_4xi32_stack_off16:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #40
; CHECK-LE-NEXT: sub sp, #40
; CHECK-LE-NEXT: vmov.i32 q0, #0x1
; CHECK-LE-NEXT: mov r0, sp
; CHECK-LE-NEXT: vstrw.32 q0, [r0]
; CHECK-LE-NEXT: movs r0, #3
; CHECK-LE-NEXT: vstrw.32 q0, [sp, #16]
; CHECK-LE-NEXT: str r0, [sp, #16]
; CHECK-LE-NEXT: vldrw.u32 q0, [sp, #16]
; CHECK-LE-NEXT: add sp, #40
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: loadstore_4xi32_stack_off16:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #40
; CHECK-BE-NEXT: sub sp, #40
; CHECK-BE-NEXT: vmov.i32 q0, #0x1
; CHECK-BE-NEXT: mov r0, sp
; CHECK-BE-NEXT: vstrw.32 q0, [r0]
; CHECK-BE-NEXT: movs r0, #3
; CHECK-BE-NEXT: vstrw.32 q0, [sp, #16]
; CHECK-BE-NEXT: str r0, [sp, #16]
; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: add sp, #40
; CHECK-BE-NEXT: bx lr
entry:
%c = alloca [1 x [5 x [2 x i32]]], align 4
%0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8*
@ -116,19 +204,34 @@ entry:
}
define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() {
; CHECK-LABEL: loadstore_8xi16_stack_off16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #40
; CHECK-NEXT: sub sp, #40
; CHECK-NEXT: vmov.i16 q0, #0x1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstrh.16 q0, [r0]
; CHECK-NEXT: movs r0, #3
; CHECK-NEXT: vstrh.16 q0, [sp, #16]
; CHECK-NEXT: strh.w r0, [sp, #16]
; CHECK-NEXT: vldrh.u16 q0, [sp, #16]
; CHECK-NEXT: add sp, #40
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: loadstore_8xi16_stack_off16:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #40
; CHECK-LE-NEXT: sub sp, #40
; CHECK-LE-NEXT: vmov.i16 q0, #0x1
; CHECK-LE-NEXT: mov r0, sp
; CHECK-LE-NEXT: vstrh.16 q0, [r0]
; CHECK-LE-NEXT: movs r0, #3
; CHECK-LE-NEXT: vstrh.16 q0, [sp, #16]
; CHECK-LE-NEXT: strh.w r0, [sp, #16]
; CHECK-LE-NEXT: vldrh.u16 q0, [sp, #16]
; CHECK-LE-NEXT: add sp, #40
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: loadstore_8xi16_stack_off16:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #40
; CHECK-BE-NEXT: sub sp, #40
; CHECK-BE-NEXT: vmov.i16 q0, #0x1
; CHECK-BE-NEXT: mov r0, sp
; CHECK-BE-NEXT: vstrh.16 q0, [r0]
; CHECK-BE-NEXT: movs r0, #3
; CHECK-BE-NEXT: vstrh.16 q0, [sp, #16]
; CHECK-BE-NEXT: strh.w r0, [sp, #16]
; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: add sp, #40
; CHECK-BE-NEXT: bx lr
entry:
%c = alloca [1 x [10 x [2 x i16]]], align 2
%0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8*
@ -144,19 +247,34 @@ entry:
}
define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() {
; CHECK-LABEL: loadstore_16xi8_stack_off16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #40
; CHECK-NEXT: sub sp, #40
; CHECK-NEXT: vmov.i8 q0, #0x1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstrb.8 q0, [r0]
; CHECK-NEXT: movs r0, #3
; CHECK-NEXT: vstrb.8 q0, [sp, #16]
; CHECK-NEXT: strb.w r0, [sp, #16]
; CHECK-NEXT: vldrb.u8 q0, [sp, #16]
; CHECK-NEXT: add sp, #40
; CHECK-NEXT: bx lr
; CHECK-LE-LABEL: loadstore_16xi8_stack_off16:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .pad #40
; CHECK-LE-NEXT: sub sp, #40
; CHECK-LE-NEXT: vmov.i8 q0, #0x1
; CHECK-LE-NEXT: mov r0, sp
; CHECK-LE-NEXT: vstrb.8 q0, [r0]
; CHECK-LE-NEXT: movs r0, #3
; CHECK-LE-NEXT: vstrb.8 q0, [sp, #16]
; CHECK-LE-NEXT: strb.w r0, [sp, #16]
; CHECK-LE-NEXT: vldrb.u8 q0, [sp, #16]
; CHECK-LE-NEXT: add sp, #40
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: loadstore_16xi8_stack_off16:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .pad #40
; CHECK-BE-NEXT: sub sp, #40
; CHECK-BE-NEXT: vmov.i8 q0, #0x1
; CHECK-BE-NEXT: mov r0, sp
; CHECK-BE-NEXT: vstrb.8 q0, [r0]
; CHECK-BE-NEXT: movs r0, #3
; CHECK-BE-NEXT: vstrb.8 q0, [sp, #16]
; CHECK-BE-NEXT: strb.w r0, [sp, #16]
; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: add sp, #40
; CHECK-BE-NEXT: bx lr
entry:
%c = alloca [1 x [20 x [2 x i8]]], align 1
%0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8*

View File

@ -1,81 +1,165 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
declare arm_aapcs_vfpcc <4 x i32> @ext_i32()
declare arm_aapcs_vfpcc <8 x i16> @ext_i16()
declare arm_aapcs_vfpcc <16 x i8> @ext_i8()
declare arm_aapcs_vfpcc <4 x i32> @ext_i32(<4 x i32> %c)
declare arm_aapcs_vfpcc <8 x i16> @ext_i16(<8 x i16> %c)
declare arm_aapcs_vfpcc <16 x i8> @ext_i8(<16 x i8> %c)
define arm_aapcs_vfpcc <4 x i32> @shuffle1_v4i32(<4 x i32> %src, <4 x i32> %a) {
; CHECK-LABEL: shuffle1_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: bl ext_i32
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r7, pc}
; CHECK-LE-LABEL: shuffle1_v4i32:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r7, lr}
; CHECK-LE-NEXT: push {r7, lr}
; CHECK-LE-NEXT: .vsave {d8, d9}
; CHECK-LE-NEXT: vpush {d8, d9}
; CHECK-LE-NEXT: .pad #8
; CHECK-LE-NEXT: sub sp, #8
; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr
; CHECK-LE-NEXT: vmov.i32 q0, #0x0
; CHECK-LE-NEXT: vpsel q0, q1, q0
; CHECK-LE-NEXT: vmov q4, q1
; CHECK-LE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-LE-NEXT: bl ext_i32
; CHECK-LE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-LE-NEXT: vpsel q0, q4, q0
; CHECK-LE-NEXT: add sp, #8
; CHECK-LE-NEXT: vpop {d8, d9}
; CHECK-LE-NEXT: pop {r7, pc}
;
; CHECK-BE-LABEL: shuffle1_v4i32:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
; CHECK-BE-NEXT: .vsave {d8, d9}
; CHECK-BE-NEXT: vpush {d8, d9}
; CHECK-BE-NEXT: .pad #8
; CHECK-BE-NEXT: sub sp, #8
; CHECK-BE-NEXT: vrev64.32 q4, q1
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vpsel q1, q4, q0
; CHECK-BE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bl ext_i32
; CHECK-BE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vpsel q1, q4, q1
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: add sp, #8
; CHECK-BE-NEXT: vpop {d8, d9}
; CHECK-BE-NEXT: pop {r7, pc}
entry:
%c = icmp eq <4 x i32> %src, zeroinitializer
%ext = call arm_aapcs_vfpcc <4 x i32> @ext_i32()
%s1 = select <4 x i1> %c, <4 x i32> %a, <4 x i32> zeroinitializer
%ext = call arm_aapcs_vfpcc <4 x i32> @ext_i32(<4 x i32> %s1)
%s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %ext
ret <4 x i32> %s
}
define arm_aapcs_vfpcc <8 x i16> @shuffle1_v8i16(<8 x i16> %src, <8 x i16> %a) {
; CHECK-LABEL: shuffle1_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vcmp.i16 eq, q0, zr
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: bl ext_i16
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r7, pc}
; CHECK-LE-LABEL: shuffle1_v8i16:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r7, lr}
; CHECK-LE-NEXT: push {r7, lr}
; CHECK-LE-NEXT: .vsave {d8, d9}
; CHECK-LE-NEXT: vpush {d8, d9}
; CHECK-LE-NEXT: .pad #8
; CHECK-LE-NEXT: sub sp, #8
; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr
; CHECK-LE-NEXT: vmov.i32 q0, #0x0
; CHECK-LE-NEXT: vpsel q0, q1, q0
; CHECK-LE-NEXT: vmov q4, q1
; CHECK-LE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-LE-NEXT: bl ext_i16
; CHECK-LE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-LE-NEXT: vpsel q0, q4, q0
; CHECK-LE-NEXT: add sp, #8
; CHECK-LE-NEXT: vpop {d8, d9}
; CHECK-LE-NEXT: pop {r7, pc}
;
; CHECK-BE-LABEL: shuffle1_v8i16:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
; CHECK-BE-NEXT: .vsave {d8, d9}
; CHECK-BE-NEXT: vpush {d8, d9}
; CHECK-BE-NEXT: .pad #8
; CHECK-BE-NEXT: sub sp, #8
; CHECK-BE-NEXT: vrev64.16 q4, q1
; CHECK-BE-NEXT: vmov.i32 q1, #0x0
; CHECK-BE-NEXT: vrev64.16 q2, q0
; CHECK-BE-NEXT: vrev32.16 q1, q1
; CHECK-BE-NEXT: vcmp.i16 eq, q2, zr
; CHECK-BE-NEXT: vpsel q1, q4, q1
; CHECK-BE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-BE-NEXT: vrev64.16 q0, q1
; CHECK-BE-NEXT: bl ext_i16
; CHECK-BE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vpsel q1, q4, q1
; CHECK-BE-NEXT: vrev64.16 q0, q1
; CHECK-BE-NEXT: add sp, #8
; CHECK-BE-NEXT: vpop {d8, d9}
; CHECK-BE-NEXT: pop {r7, pc}
entry:
%c = icmp eq <8 x i16> %src, zeroinitializer
%ext = call arm_aapcs_vfpcc <8 x i16> @ext_i16()
%s1 = select <8 x i1> %c, <8 x i16> %a, <8 x i16> zeroinitializer
%ext = call arm_aapcs_vfpcc <8 x i16> @ext_i16(<8 x i16> %s1)
%s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %ext
ret <8 x i16> %s
}
define arm_aapcs_vfpcc <16 x i8> @shuffle1_v16i8(<16 x i8> %src, <16 x i8> %a) {
; CHECK-LABEL: shuffle1_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vcmp.i8 eq, q0, zr
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: bl ext_i8
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r7, pc}
; CHECK-LE-LABEL: shuffle1_v16i8:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r7, lr}
; CHECK-LE-NEXT: push {r7, lr}
; CHECK-LE-NEXT: .vsave {d8, d9}
; CHECK-LE-NEXT: vpush {d8, d9}
; CHECK-LE-NEXT: .pad #8
; CHECK-LE-NEXT: sub sp, #8
; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr
; CHECK-LE-NEXT: vmov.i32 q0, #0x0
; CHECK-LE-NEXT: vpsel q0, q1, q0
; CHECK-LE-NEXT: vmov q4, q1
; CHECK-LE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-LE-NEXT: bl ext_i8
; CHECK-LE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-LE-NEXT: vpsel q0, q4, q0
; CHECK-LE-NEXT: add sp, #8
; CHECK-LE-NEXT: vpop {d8, d9}
; CHECK-LE-NEXT: pop {r7, pc}
;
; CHECK-BE-LABEL: shuffle1_v16i8:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
; CHECK-BE-NEXT: .vsave {d8, d9}
; CHECK-BE-NEXT: vpush {d8, d9}
; CHECK-BE-NEXT: .pad #8
; CHECK-BE-NEXT: sub sp, #8
; CHECK-BE-NEXT: vrev64.8 q4, q1
; CHECK-BE-NEXT: vmov.i32 q1, #0x0
; CHECK-BE-NEXT: vrev64.8 q2, q0
; CHECK-BE-NEXT: vrev32.8 q1, q1
; CHECK-BE-NEXT: vcmp.i8 eq, q2, zr
; CHECK-BE-NEXT: vpsel q1, q4, q1
; CHECK-BE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: bl ext_i8
; CHECK-BE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vpsel q1, q4, q1
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: add sp, #8
; CHECK-BE-NEXT: vpop {d8, d9}
; CHECK-BE-NEXT: pop {r7, pc}
entry:
%c = icmp eq <16 x i8> %src, zeroinitializer
%ext = call arm_aapcs_vfpcc <16 x i8> @ext_i8()
%s1 = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer
%ext = call arm_aapcs_vfpcc <16 x i8> @ext_i8(<16 x i8> %s1)
%s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %ext
ret <16 x i8> %s
}

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
define void @foo_int8_int32(<4 x i8>* %dest, <4 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int32: