1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
David Green 17e932c916 [ARM] Create VMOVRRD from adjacent vector extracts
This adds a combine for extract(x, n); extract(x, n+1)  ->
VMOVRRD(extract x, n/2). This allows two vector lanes to be moved at the
same time in a single instruction, and thanks to the other VMOVRRD folds
we have added recently can help reduce the amount of executed
instructions. Floating point types are very similar, but will include a
bitcast to an integer type.

This also adds a shouldRewriteCopySrc, to prevent copy propagation from
DPR to SPR, which can break as not all DPR regs can be extracted from
directly.  Otherwise the machine verifier is unhappy.

Differential Revision: https://reviews.llvm.org/D100244
2021-04-20 15:15:43 +01:00

1075 lines
40 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
; Baseline case: gather of 16 x i8 whose byte offsets are 16 x i8 values
; loaded from %offptr and zero-extended to i32 before being applied to %base.
; The mask is all-true and the passthru is undef.
; The expected assembly is a single MVE gather load, vldrb.u8 q0, [r0, q1],
; that consumes the offsets exactly as they were loaded.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
%offs.zext = zext <16 x i8> %offs to <16 x i32>
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Eight-element variant: gather of 8 x i8 with zero-extended i8 offsets.
; The expected assembly does not use a vector gather instruction. The offsets
; are widened in two halves (vldrb.u32 at [r1] and [r1, #4]), the base is
; added with vadd.i32, the resulting addresses are moved to core registers
; two at a time (vmov rX, rY, dN), and each byte is loaded with a scalar ldrb
; and inserted into a 16-bit lane of q0 with vmov.16.
define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v8i8_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vldrb.u32 q0, [r1, #4]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r2, r12, d0
; CHECK-NEXT: vmov r3, lr, d1
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb.w r12, [r12]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb.w lr, [lr]
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.16 q0[0], r4
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vmov.16 q0[1], r5
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.16 q0[2], r0
; CHECK-NEXT: vmov.16 q0[3], r1
; CHECK-NEXT: vmov.16 q0[4], r2
; CHECK-NEXT: vmov.16 q0[5], r12
; CHECK-NEXT: vmov.16 q0[6], r3
; CHECK-NEXT: vmov.16 q0[7], lr
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
%offs.zext = zext <8 x i8> %offs to <8 x i32>
%ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
%gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
ret <8 x i8> %gather
}
; Two-element variant: gather of 2 x i8 with zero-extended i8 offsets.
; The expected assembly loads both offsets with scalar ldrb, packs them into
; lanes 0 and 2 of q1, masks them with 0xff (vmov.i32 + vand), and then
; performs each element load as a scalar register-offset ldrb from r0.
; The two results are placed in lanes 0 and 2 of q0.
define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v2i8_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r1]
; CHECK-NEXT: vmov.i32 q0, #0xff
; CHECK-NEXT: ldrb r1, [r1, #1]
; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ldrb r1, [r0, r1]
; CHECK-NEXT: ldrb r0, [r0, r2]
; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
; CHECK-NEXT: bx lr
entry:
%offs = load <2 x i8>, <2 x i8>* %offptr, align 1
%offs.zext = zext <2 x i8> %offs to <2 x i32>
%ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
%gather = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> undef)
ret <2 x i8> %gather
}
; Same shape as the zero-extended 16 x i8 case, but the i8 offsets are
; sign-extended to i32 before the getelementptr.
; The expected assembly is fully scalarised: the offsets are widened four at
; a time with vldrb.s32, the base is added with vadd.i32, and every element
; is fetched with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably no single vldrb.u8 gather is formed because that
; instruction would treat the 8-bit offsets as unsigned - confirm against the
; MVE gather lowering.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vldrb.s32 q0, [r1]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r4]
; CHECK-NEXT: ldrb r2, [r5]
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: ldrb r4, [r6]
; CHECK-NEXT: vmov r6, r7, d5
; CHECK-NEXT: vldrb.s32 q2, [r1, #4]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov.8 q0[3], r7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r1
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r1
; CHECK-NEXT: vmov.8 q0[10], r5
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
%offs.sext = sext <16 x i8> %offs to <16 x i32>
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Gather of 16 x i8 where the offsets are 16 x i16 values, sign-extended to
; i32 before the getelementptr.
; The expected assembly is fully scalarised: the offsets are widened four at
; a time with vldrh.s32, the base is added with vadd.i32, and every element
; is fetched with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the 16-bit (and signed) offsets cannot be used by
; a single byte gather with 8-bit offset lanes - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrh.s32 q0, [r1, #24]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vldrh.s32 q0, [r1]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: vldrh.s32 q0, [r1, #16]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r4]
; CHECK-NEXT: ldrb r2, [r5]
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: ldrb r4, [r6]
; CHECK-NEXT: vmov r6, r7, d5
; CHECK-NEXT: vldrh.s32 q2, [r1, #8]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov.8 q0[3], r7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r1
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r1
; CHECK-NEXT: vmov.8 q0[10], r5
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%offs = load <16 x i16>, <16 x i16>* %offptr, align 2
%offs.sext = sext <16 x i16> %offs to <16 x i32>
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Gather of 16 x i8 through pointers computed by indexing an i32* base, so
; each zero-extended i8 offset is scaled by the i32 element size. The
; resulting i32* vector is bitcast to i8* before the gather.
; The expected assembly is fully scalarised: the offsets are widened with
; vldrb.u32, shifted left by 2 (vshl.i32 ... #2) to apply the scale, the base
; is added, and every element is fetched with a scalar ldrb and inserted
; with vmov.8.
; NOTE(review): presumably a byte gather has no scaled-offset form, which is
; why the shift is done explicitly - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_scaled:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrb.u32 q0, [r1, #12]
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: vldrb.u32 q0, [r1, #8]
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r4]
; CHECK-NEXT: ldrb r2, [r5]
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: ldrb r4, [r6]
; CHECK-NEXT: vmov r6, r7, d5
; CHECK-NEXT: vldrb.u32 q2, [r1, #4]
; CHECK-NEXT: vshl.i32 q2, q2, #2
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov.8 q0[3], r7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r1
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r1
; CHECK-NEXT: vmov.8 q0[10], r5
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 4
%offs.zext = zext <16 x i8> %offs to <16 x i32>
%ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
%ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Gather of 16 x i8 where the offsets are loaded directly as 16 x i32 (no
; extension from a narrower type) and used as byte offsets from %base.
; The expected assembly is fully scalarised: the offsets are loaded four at
; a time with vldrw.u32, the base is added with vadd.i32, and every element
; is fetched with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably arbitrary 32-bit offsets cannot be narrowed to the
; 8-bit offset lanes a single byte gather would need - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_i8_next:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrw.u32 q0, [r1, #48]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: vldrw.u32 q0, [r1, #32]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r4]
; CHECK-NEXT: ldrb r2, [r5]
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: ldrb r4, [r6]
; CHECK-NEXT: vmov r6, r7, d5
; CHECK-NEXT: vldrw.u32 q2, [r1, #16]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov.8 q0[3], r7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r1
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r1
; CHECK-NEXT: vmov.8 q0[10], r5
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%offs = load <16 x i32>, <16 x i32>* %offptr, align 4
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Two chained getelementptrs: the first indexes %base by 16 x i8 offsets
; loaded from memory (used as the GEP index without an explicit extension),
; the second adds a constant 5 to every resulting pointer.
; The expected assembly is fully scalarised. The offsets are widened with
; vldrb.s32 (sign-extending), the base is added, a splat of 5
; (vmov.i32 q2, #0x5) is added on top, and every element is fetched with a
; scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the non-constant, sign-extended offsets prevent
; folding both GEPs into one byte-offset gather - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
; CHECK-NEXT: vmov.i32 q2, #0x5
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q0, q0, q2
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vldrb.s32 q0, [r1]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q3, q0, q2
; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q1, q0, q2
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r4]
; CHECK-NEXT: ldrb r2, [r5]
; CHECK-NEXT: vmov r4, r5, d6
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: ldrb r4, [r6]
; CHECK-NEXT: vmov r6, r7, d7
; CHECK-NEXT: vldrb.s32 q3, [r1, #4]
; CHECK-NEXT: vadd.i32 q3, q3, r0
; CHECK-NEXT: vadd.i32 q2, q3, q2
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov.8 q0[3], r7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r1
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r1
; CHECK-NEXT: vmov.8 q0[10], r5
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
%ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Two chained getelementptrs with constant i8 indices: the first applies the
; constant offsets 0, 3, 6, ..., 45 to %base, the second adds 5 to each.
; The expected assembly folds both into one constant-pool vector of byte
; offsets 5, 8, 11, ..., 50 and performs a single vldrb.u8 gather from r0.
; %offptr is unused by the body.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: adr r1, .LCPI8_0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI8_0:
; CHECK-NEXT: .byte 5 @ 0x5
; CHECK-NEXT: .byte 8 @ 0x8
; CHECK-NEXT: .byte 11 @ 0xb
; CHECK-NEXT: .byte 14 @ 0xe
; CHECK-NEXT: .byte 17 @ 0x11
; CHECK-NEXT: .byte 20 @ 0x14
; CHECK-NEXT: .byte 23 @ 0x17
; CHECK-NEXT: .byte 26 @ 0x1a
; CHECK-NEXT: .byte 29 @ 0x1d
; CHECK-NEXT: .byte 32 @ 0x20
; CHECK-NEXT: .byte 35 @ 0x23
; CHECK-NEXT: .byte 38 @ 0x26
; CHECK-NEXT: .byte 41 @ 0x29
; CHECK-NEXT: .byte 44 @ 0x2c
; CHECK-NEXT: .byte 47 @ 0x2f
; CHECK-NEXT: .byte 50 @ 0x32
entry:
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
%ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Same as the constant two-GEP case, but the GEP indices are written as i32
; rather than i8: constant offsets 0, 3, ..., 45 followed by a constant +5.
; The summed offsets (5 .. 50) are emitted as a byte constant pool and the
; expected assembly is still a single vldrb.u8 gather from r0.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep(i8* %base) {
; CHECK-LABEL: unscaled_v16i8_i8_biggep:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: adr r1, .LCPI9_0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI9_0:
; CHECK-NEXT: .byte 5 @ 0x5
; CHECK-NEXT: .byte 8 @ 0x8
; CHECK-NEXT: .byte 11 @ 0xb
; CHECK-NEXT: .byte 14 @ 0xe
; CHECK-NEXT: .byte 17 @ 0x11
; CHECK-NEXT: .byte 20 @ 0x14
; CHECK-NEXT: .byte 23 @ 0x17
; CHECK-NEXT: .byte 26 @ 0x1a
; CHECK-NEXT: .byte 29 @ 0x1d
; CHECK-NEXT: .byte 32 @ 0x20
; CHECK-NEXT: .byte 35 @ 0x23
; CHECK-NEXT: .byte 38 @ 0x26
; CHECK-NEXT: .byte 41 @ 0x29
; CHECK-NEXT: .byte 44 @ 0x2c
; CHECK-NEXT: .byte 47 @ 0x2f
; CHECK-NEXT: .byte 50 @ 0x32
entry:
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
%ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 5
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Single getelementptr with constant i32 offsets 0, 3, 6, ..., 45 and no
; second GEP. The offsets are emitted unchanged as a byte constant pool and
; the expected assembly is a single vldrb.u8 gather from r0.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep2(i8* %base) {
; CHECK-LABEL: unscaled_v16i8_i8_biggep2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: adr r1, .LCPI10_0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .byte 0 @ 0x0
; CHECK-NEXT: .byte 3 @ 0x3
; CHECK-NEXT: .byte 6 @ 0x6
; CHECK-NEXT: .byte 9 @ 0x9
; CHECK-NEXT: .byte 12 @ 0xc
; CHECK-NEXT: .byte 15 @ 0xf
; CHECK-NEXT: .byte 18 @ 0x12
; CHECK-NEXT: .byte 21 @ 0x15
; CHECK-NEXT: .byte 24 @ 0x18
; CHECK-NEXT: .byte 27 @ 0x1b
; CHECK-NEXT: .byte 30 @ 0x1e
; CHECK-NEXT: .byte 33 @ 0x21
; CHECK-NEXT: .byte 36 @ 0x24
; CHECK-NEXT: .byte 39 @ 0x27
; CHECK-NEXT: .byte 42 @ 0x2a
; CHECK-NEXT: .byte 45 @ 0x2d
entry:
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Constant offsets 0, 3, ..., 45 followed by a second GEP that adds 256, so
; every combined offset lies in the range 256 .. 301.
; The expected assembly is fully scalarised: the combined offsets are emitted
; as four constant pools of 32-bit words, each is loaded with vldrw.u32 and
; added to the base, and every element is fetched with a scalar ldrb and
; inserted with vmov.8.
; NOTE(review): presumably no byte gather is formed because the combined
; offsets no longer fit in 8-bit offset lanes - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(i8* %base) {
; CHECK-LABEL: unscaled_v16i8_i8_biggep3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: adr r1, .LCPI11_0
; CHECK-NEXT: adr r4, .LCPI11_1
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r7, .LCPI11_3
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r1, r2, d1
; CHECK-NEXT: vmov r3, r5, d0
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: ldrb.w lr, [r1]
; CHECK-NEXT: adr r1, .LCPI11_2
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb r1, [r5]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r2, [r6]
; CHECK-NEXT: vmov r5, r6, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r5
; CHECK-NEXT: ldrb r5, [r6]
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: vldrw.u32 q2, [r7]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r5, d4
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r5
; CHECK-NEXT: vmov r0, r5, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r5
; CHECK-NEXT: vmov r0, r5, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r5
; CHECK-NEXT: vmov.8 q0[10], r4
; CHECK-NEXT: vmov.8 q0[11], r2
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r1
; CHECK-NEXT: vmov.8 q0[14], lr
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI11_0:
; CHECK-NEXT: .long 292 @ 0x124
; CHECK-NEXT: .long 295 @ 0x127
; CHECK-NEXT: .long 298 @ 0x12a
; CHECK-NEXT: .long 301 @ 0x12d
; CHECK-NEXT: .LCPI11_1:
; CHECK-NEXT: .long 280 @ 0x118
; CHECK-NEXT: .long 283 @ 0x11b
; CHECK-NEXT: .long 286 @ 0x11e
; CHECK-NEXT: .long 289 @ 0x121
; CHECK-NEXT: .LCPI11_2:
; CHECK-NEXT: .long 256 @ 0x100
; CHECK-NEXT: .long 259 @ 0x103
; CHECK-NEXT: .long 262 @ 0x106
; CHECK-NEXT: .long 265 @ 0x109
; CHECK-NEXT: .LCPI11_3:
; CHECK-NEXT: .long 268 @ 0x10c
; CHECK-NEXT: .long 271 @ 0x10f
; CHECK-NEXT: .long 274 @ 0x112
; CHECK-NEXT: .long 277 @ 0x115
entry:
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
%ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 256
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Single getelementptr with constant i32 offsets where exactly one lane
; (index 8) is 256 and all the others are small (0 .. 45).
; The expected assembly is fully scalarised: the offsets are emitted as four
; constant pools of 32-bit words, added to the base, and every element is
; fetched with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the single 256 offset, which does not fit in
; 8 bits, is what blocks the byte gather for the whole vector - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(i8* %base) {
; CHECK-LABEL: unscaled_v16i8_i8_biggep4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: adr r1, .LCPI12_0
; CHECK-NEXT: adr r4, .LCPI12_1
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r7, .LCPI12_3
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r1, r2, d1
; CHECK-NEXT: vmov r3, r5, d0
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: ldrb.w lr, [r1]
; CHECK-NEXT: adr r1, .LCPI12_2
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb r1, [r5]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r2, [r6]
; CHECK-NEXT: vmov r5, r6, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r5
; CHECK-NEXT: ldrb r5, [r6]
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: vldrw.u32 q2, [r7]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r5, d4
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r5
; CHECK-NEXT: vmov r0, r5, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r5
; CHECK-NEXT: vmov r0, r5, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r5
; CHECK-NEXT: vmov.8 q0[10], r4
; CHECK-NEXT: vmov.8 q0[11], r2
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r1
; CHECK-NEXT: vmov.8 q0[14], lr
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI12_0:
; CHECK-NEXT: .long 36 @ 0x24
; CHECK-NEXT: .long 39 @ 0x27
; CHECK-NEXT: .long 42 @ 0x2a
; CHECK-NEXT: .long 45 @ 0x2d
; CHECK-NEXT: .LCPI12_1:
; CHECK-NEXT: .long 256 @ 0x100
; CHECK-NEXT: .long 27 @ 0x1b
; CHECK-NEXT: .long 30 @ 0x1e
; CHECK-NEXT: .long 33 @ 0x21
; CHECK-NEXT: .LCPI12_2:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .LCPI12_3:
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 15 @ 0xf
; CHECK-NEXT: .long 18 @ 0x12
; CHECK-NEXT: .long 21 @ 0x15
entry:
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 256, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; The base is already a vector of 16 pointers (there is no scalar base), and
; a single getelementptr adds 256 to every pointer before the gather.
; The expected assembly splats 0x100 into q4 (saving and restoring d8/d9
; around its use), adds it to each of the four incoming pointer vectors
; q0-q3, and fetches every element with a scalar ldrb inserted via vmov.8.
; NOTE(review): presumably a byte gather is not usable here because there is
; no common scalar base to pair with 8-bit offsets - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) {
; CHECK-LABEL: unscaled_v16i8_i8_biggep5:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.i32 q4, #0x100
; CHECK-NEXT: vadd.i32 q3, q3, q4
; CHECK-NEXT: vadd.i32 q2, q2, q4
; CHECK-NEXT: vmov r3, r2, d7
; CHECK-NEXT: vadd.i32 q1, q1, q4
; CHECK-NEXT: vmov r0, r1, d6
; CHECK-NEXT: vadd.i32 q3, q0, q4
; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r1]
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb r1, [r5]
; CHECK-NEXT: vmov r2, r5, d6
; CHECK-NEXT: ldrb r4, [r0]
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r2
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: vmov r2, r5, d7
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[2], r2
; CHECK-NEXT: vmov.8 q0[3], r5
; CHECK-NEXT: vmov r2, r5, d2
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[4], r2
; CHECK-NEXT: vmov.8 q0[5], r5
; CHECK-NEXT: vmov r2, r5, d3
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[6], r2
; CHECK-NEXT: vmov.8 q0[7], r5
; CHECK-NEXT: vmov r2, r5, d4
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[8], r2
; CHECK-NEXT: vmov.8 q0[9], r5
; CHECK-NEXT: vmov.8 q0[10], r1
; CHECK-NEXT: vmov.8 q0[11], r0
; CHECK-NEXT: vmov.8 q0[12], r4
; CHECK-NEXT: vmov.8 q0[13], r3
; CHECK-NEXT: vmov.8 q0[14], lr
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Constant offsets with one large lane (index 8 is 256, the rest are
; 0 .. 45), followed by a second GEP that adds 1 to every pointer.
; The two GEPs are folded into constant pools holding offset + 1 (1, 4, ...,
; 257, ..., 46) as 32-bit words, but the expected assembly is still fully
; scalarised: each pool is added to the base and every element is fetched
; with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the 257 offset not fitting in 8 bits is what
; prevents the byte gather - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(i8* %base) {
; CHECK-LABEL: unscaled_v16i8_i8_biggep6:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: adr r1, .LCPI14_0
; CHECK-NEXT: adr r4, .LCPI14_1
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r7, .LCPI14_3
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r1, r2, d1
; CHECK-NEXT: vmov r3, r5, d0
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: ldrb.w lr, [r1]
; CHECK-NEXT: adr r1, .LCPI14_2
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb r1, [r5]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r2, [r6]
; CHECK-NEXT: vmov r5, r6, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r5
; CHECK-NEXT: ldrb r5, [r6]
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: vldrw.u32 q2, [r7]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r5, d4
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r5
; CHECK-NEXT: vmov r0, r5, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r5
; CHECK-NEXT: vmov r0, r5, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r5
; CHECK-NEXT: vmov.8 q0[10], r4
; CHECK-NEXT: vmov.8 q0[11], r2
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r1
; CHECK-NEXT: vmov.8 q0[14], lr
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI14_0:
; CHECK-NEXT: .long 37 @ 0x25
; CHECK-NEXT: .long 40 @ 0x28
; CHECK-NEXT: .long 43 @ 0x2b
; CHECK-NEXT: .long 46 @ 0x2e
; CHECK-NEXT: .LCPI14_1:
; CHECK-NEXT: .long 257 @ 0x101
; CHECK-NEXT: .long 28 @ 0x1c
; CHECK-NEXT: .long 31 @ 0x1f
; CHECK-NEXT: .long 34 @ 0x22
; CHECK-NEXT: .LCPI14_2:
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .LCPI14_3:
; CHECK-NEXT: .long 13 @ 0xd
; CHECK-NEXT: .long 16 @ 0x10
; CHECK-NEXT: .long 19 @ 0x13
; CHECK-NEXT: .long 22 @ 0x16
entry:
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 256, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
%ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 1
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Constant offsets where lane 0 is 100 and the rest are 3 .. 45, followed by
; a second GEP that adds 200. Each addend fits in 8 bits on its own, but the
; sums range from 203 up to 300 for lane 0.
; The combined offsets are emitted as constant pools of 32-bit words and the
; expected assembly is fully scalarised: each pool is added to the base and
; every element is fetched with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the lane-0 sum of 300 overflowing 8 bits is what
; prevents the byte gather - confirm.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(i8* %base) {
; CHECK-LABEL: unscaled_v16i8_i8_biggep7:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: adr r1, .LCPI15_0
; CHECK-NEXT: adr r4, .LCPI15_1
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r7, .LCPI15_3
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r1, r2, d1
; CHECK-NEXT: vmov r3, r5, d0
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: ldrb.w lr, [r1]
; CHECK-NEXT: adr r1, .LCPI15_2
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb r1, [r5]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r2, [r6]
; CHECK-NEXT: vmov r5, r6, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r5
; CHECK-NEXT: ldrb r5, [r6]
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: vldrw.u32 q2, [r7]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r5, d4
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r5
; CHECK-NEXT: vmov r0, r5, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r5
; CHECK-NEXT: vmov r0, r5, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r5
; CHECK-NEXT: vmov.8 q0[10], r4
; CHECK-NEXT: vmov.8 q0[11], r2
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r1
; CHECK-NEXT: vmov.8 q0[14], lr
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI15_0:
; CHECK-NEXT: .long 236 @ 0xec
; CHECK-NEXT: .long 239 @ 0xef
; CHECK-NEXT: .long 242 @ 0xf2
; CHECK-NEXT: .long 245 @ 0xf5
; CHECK-NEXT: .LCPI15_1:
; CHECK-NEXT: .long 224 @ 0xe0
; CHECK-NEXT: .long 227 @ 0xe3
; CHECK-NEXT: .long 230 @ 0xe6
; CHECK-NEXT: .long 233 @ 0xe9
; CHECK-NEXT: .LCPI15_2:
; CHECK-NEXT: .long 300 @ 0x12c
; CHECK-NEXT: .long 203 @ 0xcb
; CHECK-NEXT: .long 206 @ 0xce
; CHECK-NEXT: .long 209 @ 0xd1
; CHECK-NEXT: .LCPI15_3:
; CHECK-NEXT: .long 212 @ 0xd4
; CHECK-NEXT: .long 215 @ 0xd7
; CHECK-NEXT: .long 218 @ 0xda
; CHECK-NEXT: .long 221 @ 0xdd
entry:
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 100, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
%ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 200
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Byte gather where the <16 x i8> offsets loaded from %offptr are used directly
; as GEP indices, with no zext in the IR.
; The expected output below widens the offsets with sign-extending loads
; (vldrb.s32, four lanes at a time), adds r0 to form the addresses, and then
; performs sixteen scalar ldrb loads inserted lane by lane with vmov.8; no
; vldrb.u8 gather instruction is expected.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(i8* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_i8_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vldrb.s32 q0, [r1]
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r4]
; CHECK-NEXT: ldrb r2, [r5]
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: ldrb r4, [r6]
; CHECK-NEXT: vmov r6, r7, d5
; CHECK-NEXT: vldrb.s32 q2, [r1, #4]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov.8 q0[3], r7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r1
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r1
; CHECK-NEXT: vmov.8 q0[10], r5
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
; The i8 offsets index the GEP directly (no zext to i32 first).
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
; All-true mask, alignment 1, undef pass-through.
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Byte gather with compile-time-constant <16 x i8> offsets <0, 3, 6, ..., 45>.
; The expected output below is a single vldrb.u8 gather using r0 as the base
; and an offset vector loaded from a 16-entry .byte constant pool.
; %offptr is part of the signature but is not referenced in the body.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_3(i8* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_i8_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: adr r1, .LCPI17_0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI17_0:
; CHECK-NEXT: .byte 0 @ 0x0
; CHECK-NEXT: .byte 3 @ 0x3
; CHECK-NEXT: .byte 6 @ 0x6
; CHECK-NEXT: .byte 9 @ 0x9
; CHECK-NEXT: .byte 12 @ 0xc
; CHECK-NEXT: .byte 15 @ 0xf
; CHECK-NEXT: .byte 18 @ 0x12
; CHECK-NEXT: .byte 21 @ 0x15
; CHECK-NEXT: .byte 24 @ 0x18
; CHECK-NEXT: .byte 27 @ 0x1b
; CHECK-NEXT: .byte 30 @ 0x1e
; CHECK-NEXT: .byte 33 @ 0x21
; CHECK-NEXT: .byte 36 @ 0x24
; CHECK-NEXT: .byte 39 @ 0x27
; CHECK-NEXT: .byte 42 @ 0x2a
; CHECK-NEXT: .byte 45 @ 0x2d
entry:
; Constant i8 offsets, all non-negative; they reappear unchanged as the .byte
; entries of the expected constant pool.
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
; All-true mask, alignment 1, undef pass-through.
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Byte gather through an i16* base: the zero-extended i8 offsets index a GEP
; over i16 elements, and the resulting pointers are bitcast to i8* before the
; gather, so one byte is read from each i16-scaled address.
; The expected output below loads the offsets with vldrb.u32 (four lanes at a
; time), shifts them left by one (vshl.i32 #1) for the i16 element size, adds
; r0, and then performs sixteen scalar ldrb loads inserted lane by lane with
; vmov.8; no vldrb.u8 gather instruction is expected.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(i16* %base, <16 x i8>* %offptr) {
; CHECK-LABEL: unscaled_v16i8_basei16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrb.u32 q0, [r1, #12]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: vldrb.u32 q0, [r1, #8]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q1, q0, r0
; CHECK-NEXT: ldrb.w r12, [r2]
; CHECK-NEXT: ldrb.w lr, [r3]
; CHECK-NEXT: ldrb r3, [r4]
; CHECK-NEXT: ldrb r2, [r5]
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q0[0], r4
; CHECK-NEXT: vmov r4, r6, d3
; CHECK-NEXT: vmov.8 q0[1], r5
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: ldrb r4, [r6]
; CHECK-NEXT: vmov r6, r7, d5
; CHECK-NEXT: vldrb.u32 q2, [r1, #4]
; CHECK-NEXT: vshl.i32 q2, q2, #1
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: ldrb r0, [r6]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov.8 q0[3], r7
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.8 q0[7], r1
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.8 q0[9], r1
; CHECK-NEXT: vmov.8 q0[10], r5
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r12
; CHECK-NEXT: vmov.8 q0[15], lr
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
%offs.zext = zext <16 x i8> %offs to <16 x i32>
; GEP over i16 elements: each offset is scaled by the 2-byte element size.
%ptrs = getelementptr inbounds i16, i16* %base, <16 x i32> %offs.zext
; Reinterpret the i16 pointers as i8 pointers so the gather reads single bytes.
%ptrs.cast = bitcast <16 x i16*> %ptrs to <16 x i8*>
; All-true mask, alignment 1, undef pass-through.
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs.cast, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
ret <16 x i8> %gather
}
; Declarations of the masked-gather intrinsic for the i8 vector widths
; (16, 8 and 2 lanes). Operand order, as used by the calls in this file:
; vector of pointers, i32 alignment, per-lane i1 mask, pass-through vector.
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>)