mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 10:42:39 +01:00
17e932c916
This adds a combine for extract(x, n); extract(x, n+1) -> VMOVRRD(extract x, n/2). This allows two vector lanes to be moved at the same time in a single instruction and, thanks to the other VMOVRRD folds we have added recently, can help reduce the number of executed instructions. Floating point types are very similar, but will include a bitcast to an integer type. This also adds a shouldRewriteCopySrc, to prevent copy propagation from DPR to SPR, which can break as not all DPR regs can be extracted from directly. Otherwise the machine verifier is unhappy. Differential Revision: https://reviews.llvm.org/D100244
94 lines
3.9 KiB
LLVM
94 lines
3.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv7s-none-eabi %s -o - | FileCheck %s
|
|
|
|
; NEON table-lookup intrinsic called by every function in this test: it takes
; three <8 x i8> operands and returns <8 x i8>. The expected output in this
; file shows each call emitted as vtbl.8 with a two-register list ({d16, d17}).
declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffle.i27.i308, <8 x i8> %vtbl2.i25.i)
|
|
|
|
; Check that we get the motivating example:
|
|
; The bitcasts force the values to go through the GPRs, whereas
|
|
; they are defined on VPRs and used on VPRs.
|
|
;
|
|
; Inputs: %addr points to a <2 x i64> whose two lanes become the first two
; vtbl2 operands; %addr2 points to the <8 x i8> third operand and also receives
; the result.
; The expected output keeps everything in d-registers: vld1.64 and vldr feed
; vtbl.8 directly, with no vmov to or from core registers in between.
define void @motivatingExample(<2 x i64>* %addr, <8 x i8>* %addr2) {
|
|
; CHECK-LABEL: motivatingExample:
|
|
; CHECK: @ %bb.0:
|
|
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
|
|
; CHECK-NEXT: vldr d18, [r1]
|
|
; CHECK-NEXT: vtbl.8 d16, {d16, d17}, d18
|
|
; CHECK-NEXT: vstr d16, [r1]
|
|
; CHECK-NEXT: bx lr
|
|
; Load the 128-bit source vector.
%shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr
|
|
; Load the 64-bit vector passed as the third vtbl2 operand.
%vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2
|
|
; Split the source vector into its two i64 lanes using constant indices 0 and 1.
%shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0
|
|
%shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1
|
|
; Reinterpret each scalar i64 lane as an <8 x i8> vector.
%tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8>
|
|
%tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8>
|
|
; Each reinterpreted lane has exactly one user: this call.
%vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i)
|
|
; Write the lookup result back through %addr2.
store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2
|
|
ret void
|
|
}
|
|
|
|
; Check that we do not perform the transformation for dynamic index.
|
|
; Same IR shape as @motivatingExample, except that the first extractelement
; uses the runtime value %index instead of a constant lane number; the second
; extractelement still uses the constant index 1.
; The expected output goes through memory and core registers for the
; dynamically indexed lane: sp is realigned to 16 bytes (bfc r4, #0, #4), the
; vector is stored to that stack slot (vst1.64 ... [r3:128]), the selected lane
; is reloaded as two 32-bit words (ldr r2 / ldr r0), and vmov d16, r2, r0
; rebuilds it before vtbl.8. d17 is left as loaded by the initial vld1.64.
define void @dynamicIndex(<2 x i64>* %addr, <8 x i8>* %addr2, i32 %index) {
|
|
; CHECK-LABEL: dynamicIndex:
|
|
; CHECK: @ %bb.0:
|
|
; CHECK-NEXT: .save {r4, r6, r7, lr}
|
|
; CHECK-NEXT: push {r4, r6, r7, lr}
|
|
; CHECK-NEXT: .setfp r7, sp, #8
|
|
; CHECK-NEXT: add r7, sp, #8
|
|
; CHECK-NEXT: .pad #16
|
|
; CHECK-NEXT: sub sp, #16
|
|
; CHECK-NEXT: mov r4, sp
|
|
; CHECK-NEXT: bfc r4, #0, #4
|
|
; CHECK-NEXT: mov sp, r4
|
|
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
|
|
; CHECK-NEXT: adds r0, r2, r2
|
|
; CHECK-NEXT: and r2, r0, #3
|
|
; CHECK-NEXT: adds r0, #1
|
|
; CHECK-NEXT: mov r12, sp
|
|
; CHECK-NEXT: and r0, r0, #3
|
|
; CHECK-NEXT: lsls r2, r2, #2
|
|
; CHECK-NEXT: mov r3, r12
|
|
; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128], r2
|
|
; CHECK-NEXT: orr.w r0, r12, r0, lsl #2
|
|
; CHECK-NEXT: sub.w r4, r7, #8
|
|
; CHECK-NEXT: ldr r2, [r3]
|
|
; CHECK-NEXT: ldr r0, [r0]
|
|
; CHECK-NEXT: vldr d18, [r1]
|
|
; CHECK-NEXT: vmov d16, r2, r0
|
|
; CHECK-NEXT: vtbl.8 d16, {d16, d17}, d18
|
|
; CHECK-NEXT: vstr d16, [r1]
|
|
; CHECK-NEXT: mov sp, r4
|
|
; CHECK-NEXT: pop {r4, r6, r7, pc}
|
|
; Load the 128-bit source vector.
%shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr
|
|
; Load the 64-bit vector passed as the third vtbl2 operand.
%vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2
|
|
; Lane selected at run time by %index: this is what distinguishes the test.
%shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 %index
|
|
; Lane 1 is still selected with a constant index.
%shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1
|
|
; Reinterpret each scalar i64 lane as an <8 x i8> vector.
%tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8>
|
|
%tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8>
|
|
%vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i)
|
|
; Write the lookup result back through %addr2.
store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2
|
|
ret void
|
|
}
|
|
|
|
; Check that we do not perform the transformation when there are several uses
|
|
; of the result of the bitcast.
|
|
; Same IR as @motivatingExample, except that the function returns i64 and its
; return value is lane 0 (%shuffle.i.extract.i310). In the IR that
; extractelement result therefore has two users: one bitcast and the ret.
; The expected output still feeds vtbl.8 from d16/d17 and contains one
; vmov r0, r2, d16 followed later by mov r1, r2, which appears to place the
; i64 return value in r0/r1.
define i64 @severalUses(<2 x i64>* %addr, <8 x i8>* %addr2) {
|
|
; CHECK-LABEL: severalUses:
|
|
; CHECK: @ %bb.0:
|
|
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
|
|
; CHECK-NEXT: vmov r0, r2, d16
|
|
; CHECK-NEXT: vldr d18, [r1]
|
|
; CHECK-NEXT: vtbl.8 d16, {d16, d17}, d18
|
|
; CHECK-NEXT: vstr d16, [r1]
|
|
; CHECK-NEXT: mov r1, r2
|
|
; CHECK-NEXT: bx lr
|
|
; Load the 128-bit source vector.
%shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr
|
|
; Load the 64-bit vector passed as the third vtbl2 operand.
%vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2
|
|
; Lane 0 is used twice below: by the first bitcast and by the ret.
%shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0
|
|
%shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1
|
|
; Reinterpret each scalar i64 lane as an <8 x i8> vector.
%tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8>
|
|
%tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8>
|
|
%vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i)
|
|
; Write the lookup result back through %addr2.
store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2
|
|
; Second use of lane 0: returned as a scalar i64.
ret i64 %shuffle.i.extract.i310
|
|
}
|