[ARM][LowOverheadLoops] Don't ignore VCTP

When expanding the LoopStart, we try to remove the iteration count calculation. However, if part of that calculation is also used to compute the number of elements, we could end up deleting instructions that are still required to feed DLSTP/WLSTP. To avoid this, the VCTP is no longer added to the set of instructions that the removal analysis ignores. Differential Revision: https://reviews.llvm.org/D73275
2024-11-23 19:23:23 +01:00 · 2020-01-27 10:58:46 +00:00 · 2020-01-27 10:58:46 +00:00 · 15c79be89a
commit 15c79be89a
parent 25a13f02ab
2 changed files with 165 additions and 2 deletions
--- a/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp
@ -900,8 +900,7 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
      SmallPtrSet<MachineInstr*, 4> Visited;
      SmallPtrSet<MachineInstr*, 4> Remove;
      SmallPtrSet<MachineInstr*, 4> Ignore = { LoLoop.Start, LoLoop.Dec,
-                                               LoLoop.End, LoLoop.VCTP,
-                                               LoLoop.InsertPt };
+                                               LoLoop.End, LoLoop.InsertPt };
      SmallVector<MachineInstr*, 4> Chain = { Def };
      while (!Chain.empty()) {
        MachineInstr *MI = Chain.back();
--- a/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
@ -0,0 +1,164 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
+
+--- |
+  define hidden arm_aapcs_vfpcc void @dont_ignore_vctp(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 { ; %mul feeds both the iteration count and the vctp32 operand
+  entry:
+    %mul = shl i32 %blockSize, 1 ; %mul = %blockSize * 2; also the initial value of %blkCnt.0
+    %0 = add i32 %mul, 3 ; %mul + 3
+    %1 = icmp slt i32 %mul, 4 ; signed compare: %mul < 4
+    %smin = select i1 %1, i32 %mul, i32 4 ; signed min(%mul, 4)
+    %2 = sub i32 %0, %smin ; (%mul + 3) - min(%mul, 4)
+    %3 = lshr i32 %2, 2 ; logical shift right by 2 (unsigned divide by 4)
+    %4 = add nuw nsw i32 %3, 1 ; iteration count handed to set.loop.iterations
+    call void @llvm.set.loop.iterations.i32(i32 %4)
+    br label %do.body
+
+  do.body:                                          ; preds = %do.body, %entry
+    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ] ; starts at %mul, reduced by 4 each iteration (%sub)
+    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ] ; current destination pointer
+    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] ; current source pointer
+    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ] ; loop counter: starts at %4, updated by loop.decrement.reg
+    %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) ; <4 x i1> mask computed from %blkCnt.0; used by the masked load/store below
+    %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
+    %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef) ; load 4 floats from the source under mask %6
+    %8 = fmul <4 x float> %7, <float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00> ; multiply lanes by <1, -1, 1, -1>
+    %output_cast = bitcast float* %pDst.addr.0 to <4 x float>*
+    tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %8, <4 x float>* %output_cast, i32 4, <4 x i1> %6) ; store to the destination under the same mask %6
+    %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4 ; advance source by 4 floats
+    %add.ptr4 = getelementptr inbounds float, float* %pDst.addr.0, i32 4 ; advance destination by 4 floats
+    %sub = add nsw i32 %blkCnt.0, -4 ; %blkCnt.0 - 4, fed back into the %blkCnt.0 phi
+    %9 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %5, i32 1) ; loop.decrement.reg called with (%5, 1); result fed back into the %5 phi
+    %10 = icmp ne i32 %9, 0 ; continue while the updated counter is non-zero
+    br i1 %10, label %do.body, label %do.end
+
+  do.end:                                           ; preds = %do.body
+    ret void
+  }
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
+  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
+  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+
+...
+---
+name:            dont_ignore_vctp
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:
+  - id:              0
+    value:           '<4 x float> <float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00>'
+    alignment:       16
+    isTargetSpecific: false
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: dont_ignore_vctp
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   renamable $r3, dead $cpsr = tLSLri killed renamable $r2, 1, 14, $noreg
+  ; CHECK:   t2IT 11, 8, implicit-def dead $itstate
+  ; CHECK:   renamable $r2 = tLEApcrel %const.0, 14, $noreg
+  ; CHECK:   renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r3
+  ; CHECK: bb.1.do.body (align 4):
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1
+  ; CHECK:   renamable $q1 = nnan ninf nsz MVE_VLDRWU32 renamable $r0, 0, 0, $noreg
+  ; CHECK:   renamable $q1 = nnan ninf nsz MVE_VMULf32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1
+  ; CHECK:   MVE_VSTRWU32 killed renamable $q1, renamable $r1, 0, 0, killed $noreg
+  ; CHECK:   renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 16, 14, $noreg
+  ; CHECK:   renamable $r1, dead $cpsr = nuw tADDi8 killed renamable $r1, 16, 14, $noreg
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK: bb.2.do.end:
+  ; CHECK:   tPOP_RET 14, $noreg, def $r7, def $pc
+  ; CHECK: bb.3 (align 16):
+  ; CHECK:   CONSTPOOL_ENTRY 0, %const.0, 16
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+
+    frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    renamable $r3, dead $cpsr = tLSLri renamable $r2, 1, 14, $noreg
+    renamable $r12 = t2MOVi 4, 14, $noreg, $noreg
+    tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    $r12 = t2LSLri renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate
+    renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg
+    renamable $r12 = t2ADDri killed renamable $r2, 3, 14, $noreg, $noreg
+    renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
+    renamable $r2 = tLEApcrel %const.0, 14, $noreg
+    renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
+    t2DoLoopStart renamable $lr
+
+  bb.1.do.body (align 4):
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+    MVE_VPST 2, implicit $vpr
+    renamable $q1 = nnan ninf nsz MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr
+    renamable $q1 = nnan ninf nsz MVE_VMULf32 killed renamable $q1, renamable $q0, 1, renamable $vpr, undef renamable $q1
+    MVE_VSTRWU32 killed renamable $q1, renamable $r1, 0, 1, killed renamable $vpr
+    renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 16, 14, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    renamable $r1, dead $cpsr = nuw tADDi8 killed renamable $r1, 16, 14, $noreg
+    renamable $r3, dead $cpsr = nsw tSUBi8 killed renamable $r3, 4, 14, $noreg
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14, $noreg
+
+  bb.2.do.end:
+    tPOP_RET 14, $noreg, def $r7, def $pc
+
+  bb.3 (align 16):
+    CONSTPOOL_ENTRY 0, %const.0, 16
+
+...