1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

[ARM] Ensure instructions are simplified prior to GatherScatter lowering.

Surprisingly, not all instructions are always simplified after unrolling
and before MVE gather/scatter lowering. Notably, dead gather operations
can be left around, which causes the gather/scatter lowering pass to crash
if there are multiple gathers, some of which are dead.

This patch ensures they are simplified before we modify anything, which
can change some of the existing tests, including making them no longer
test what they originally tested. To keep those tests behaving as
before, this patch uses a combination of disabling the gather/scatter
lowering pass in their RUN lines and adjusting the tests themselves.

Differential Revision: https://reviews.llvm.org/D103150
This commit is contained in:
David Green 2021-06-10 20:18:12 +01:00
parent 2afb636ed1
commit a3bd30b5e4
9 changed files with 94 additions and 50 deletions

View File

@ -1167,6 +1167,8 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
bool Changed = false;
for (BasicBlock &BB : F) {
SimplifyInstructionsInBlock(&BB);
for (Instruction &I : BB) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -enable-arm-maskedgatscat=false %s -o - | FileCheck %s
define void @remat_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5) {
; CHECK-LABEL: remat_vctp:

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
define void @_Z4loopPxS_iS_i(i64* %d) {
; CHECK-LABEL: _Z4loopPxS_iS_i:

View File

@ -321,26 +321,29 @@ end:
ret void;
}
define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
; CHECK-LABEL: non_gatscat_use1:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI7_0
; CHECK-NEXT: vmov.i32 q0, #0x8
; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: adr.w r12, .LCPI7_0
; CHECK-NEXT: vmov.i32 q0, #0x9
; CHECK-NEXT: vldrw.u32 q3, [r12]
; CHECK-NEXT: vmov.i32 q1, #0xc
; CHECK-NEXT: vmov.i32 q2, #0x8
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q3, q2, q0
; CHECK-NEXT: vmlas.u32 q2, q1, r0
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
; CHECK-NEXT: vadd.i32 q4, q3, q2
; CHECK-NEXT: vmul.i32 q5, q3, q0
; CHECK-NEXT: vmlas.u32 q3, q1, r0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: vldrw.u32 q6, [q3, #24]
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vstrw.32 q5, [r3]
; CHECK-NEXT: vstrb.8 q6, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
@ -364,6 +367,7 @@ vector.body: ; preds = %vector.body, %vecto
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%non_gatscat_use = mul <4 x i32> %0, <i32 3, i32 3, i32 3, i32 3>
store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
@ -373,26 +377,31 @@ end:
ret void;
}
define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
; CHECK-LABEL: non_gatscat_use2:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vmov.i32 q0, #0x8
; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: vmov.i32 q1, #0xc
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: adr.w r12, .LCPI8_0
; CHECK-NEXT: vmov.i32 q0, #0x12
; CHECK-NEXT: vldrw.u32 q4, [r12]
; CHECK-NEXT: vmov.i32 q1, #0x9
; CHECK-NEXT: vmov.i32 q2, #0x8
; CHECK-NEXT: vmov.i32 q3, #0xc
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q3, q2, q0
; CHECK-NEXT: vmlas.u32 q2, q1, r0
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
; CHECK-NEXT: vadd.i32 q5, q4, q2
; CHECK-NEXT: vmul.i32 q6, q4, q1
; CHECK-NEXT: vmlas.u32 q4, q3, r0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [q4, #24]
; CHECK-NEXT: vadd.i32 q4, q6, q0
; CHECK-NEXT: vstrw.32 q4, [r3]
; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: vstrb.8 q7, [r1], #16
; CHECK-NEXT: bne .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
@ -416,6 +425,7 @@ vector.body: ; preds = %vector.body, %vecto
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%non_gatscat_use = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
@ -849,12 +859,12 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: add.w r8, r7, #10
; CHECK-NEXT: adr r7, .LCPI11_0
; CHECK-NEXT: ldr r1, [sp, #96]
; CHECK-NEXT: vdup.32 q1, r2
; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: vldrw.u32 q1, [r7]
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: mov.w r9, #6
; CHECK-NEXT: movs r6, #11
; CHECK-NEXT: vshl.i32 q1, q1, #2
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@ -889,10 +899,10 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: mul r4, r11, r6
; CHECK-NEXT: vdup.32 q3, r5
; CHECK-NEXT: vdup.32 q2, r7
; CHECK-NEXT: vadd.i32 q4, q0, r4
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: vmla.u32 q3, q4, r2
; CHECK-NEXT: adds r4, #113
; CHECK-NEXT: vadd.i32 q4, q0, r4
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: mov r4, r8
; CHECK-NEXT: vmla.u32 q2, q4, r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
@ -902,8 +912,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
; CHECK-NEXT: vadd.i32 q5, q2, q1
; CHECK-NEXT: vadd.i32 q4, q3, q1
; CHECK-NEXT: vadd.i32 q5, q2, q0
; CHECK-NEXT: vadd.i32 q4, q3, q0
; CHECK-NEXT: subs r4, #4
; CHECK-NEXT: vadd.i32 q2, q6, r2
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]

View File

@ -0,0 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
; This file has some unused gathers, making sure that they do not cause
; problems as the function gets simplified.
; A single masked gather whose result is never used. The expected assembly
; below is only the function return, i.e. no gather load is emitted.
define arm_aapcs_vfpcc void @unused1(<4 x i32*> %offs) {
; CHECK-LABEL: unused1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
ret void
}
; Two masked gathers from the same pointer vector, neither of which has any
; users. The expected assembly below is only the function return.
define arm_aapcs_vfpcc void @unused2(<4 x i32*> %offs) {
; CHECK-LABEL: unused2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
ret void
}
; Two masked gathers whose results are combined by an add, but the add itself
; has no users, so the gathers are only used by a dead instruction. The
; expected assembly below is only the function return.
define arm_aapcs_vfpcc void @unused2_used(<4 x i32*> %offs) {
; CHECK-LABEL: unused2_used:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%unused = add <4 x i32> %gather1, %gather2
ret void
}
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.

View File

@ -170,8 +170,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @cmpugez_v4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: cmpugez_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp eq <4 x i32> %a, zeroinitializer

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
define arm_aapcs_vfpcc <4 x i32> @test_v4i32(i32 %x, <4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: test_v4i32:

View File

@ -70,8 +70,6 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_i16_c:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov.u16 r1, q0[0]
@ -86,41 +84,37 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-NEXT: vmov.u16 r1, q1[1]
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmullb.s16 q0, q3, q0
; CHECK-NEXT: vmov.i32 q3, #0x7fff
; CHECK-NEXT: vshl.i32 q0, q0, #10
; CHECK-NEXT: vshr.s32 q0, q0, #10
; CHECK-NEXT: vshr.s32 q0, q0, #15
; CHECK-NEXT: vmin.s32 q4, q0, q3
; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: vshr.s32 q3, q0, #15
; CHECK-NEXT: vmov r0, r1, d6
; CHECK-NEXT: vmov.16 q0[0], r0
; CHECK-NEXT: vmov.16 q0[1], r1
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: vmov r0, r1, d7
; CHECK-NEXT: vmov.16 q0[2], r0
; CHECK-NEXT: vmov.u16 r0, q2[6]
; CHECK-NEXT: vmov.16 q0[3], r1
; CHECK-NEXT: vmov.u16 r1, q2[4]
; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[7]
; CHECK-NEXT: vmov.u16 r1, q2[5]
; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[6]
; CHECK-NEXT: vmov.u16 r1, q1[4]
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[7]
; CHECK-NEXT: vmov.u16 r1, q1[5]
; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
; CHECK-NEXT: vmullb.s16 q1, q2, q4
; CHECK-NEXT: vmullb.s16 q1, q2, q3
; CHECK-NEXT: vshl.i32 q1, q1, #10
; CHECK-NEXT: vshr.s32 q1, q1, #10
; CHECK-NEXT: vshr.s32 q1, q1, #15
; CHECK-NEXT: vmin.s32 q1, q1, q3
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmov.16 q0[4], r0
; CHECK-NEXT: vmov.16 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmov.16 q0[6], r0
; CHECK-NEXT: vmov.16 q0[7], r1
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%l2 = sext <8 x i16> %s0 to <8 x i22>