[LV] Parallel annotated loop does not imply all loads can be hoisted.
As noted in https://bugs.llvm.org/show_bug.cgi?id=46666, the current behavior of assuming if-conversion safety whenever a loop is annotated parallel (`!llvm.loop.parallel_accesses`) is not what users expect, can lead to invalid reads, and the documentation for this behavior has since been removed from the LangRef again. The problem was observed in POCL (https://github.com/pocl/pocl/issues/757) and would require similar workarounds in current work on hipSYCL.

The question remains why this behavior was added in the first place and what the implications of removing the optimization are. Do we need an alternative mechanism to propagate the legality of if-conversion, or is the idea that conditional loads under `#pragma clang loop vectorize(assume_safety)` can be executed unmasked without additional checks flawed in general? I think this implication is not something a user of that pragma (and of the corresponding metadata) would expect, and is therefore dangerous.

Only two additional tests failed; they are adapted in this patch. Depending on the further direction, force-ifcvt.ll should be removed or adapted further.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D103907
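For illustration, the sketch below is not taken from the bug reports or from this patch; the function name, array names, and guard condition are made up. The pragma only promises that iterations are independent; it says nothing about whether d[i] is dereferenceable on iterations where the guard is false. Treating the annotation as an if-conversion guarantee lets the vectorizer hoist the guarded load and execute it unmasked for every lane, which can read memory the original program never touches.

// Illustrative sketch only (not from the patch): a guarded load under
// vectorize(assume_safety). There are no cross-iteration dependences,
// but d[i] is only known to be a valid access when p[i] == 0.
void guarded_load(int *res, const int *d, const int *p, int n) {
#pragma clang loop vectorize(assume_safety)
  for (int i = 0; i < n; ++i) {
    int r = res[i];
    if (p[i] == 0)
      r += d[i]; // executing this load unconditionally for every lane may fault
    res[i] = r;
  }
}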
Commit 6b73f118b0 (parent 2425855981)
@@ -435,22 +435,17 @@ private:
   bool canVectorizeOuterLoop();
 
   /// Return true if all of the instructions in the block can be speculatively
-  /// executed, and record the loads/stores that require masking. If's that
-  /// guard loads can be ignored under "assume safety" unless \p PreserveGuards
-  /// is true. This can happen when we introduces guards for which the original
-  /// "unguarded-loads are safe" assumption does not hold. For example, the
-  /// vectorizer's fold-tail transformation changes the loop to execute beyond
-  /// its original trip-count, under a proper guard, which should be preserved.
+  /// executed, and record the loads/stores that require masking.
   /// \p SafePtrs is a list of addresses that are known to be legal and we know
   /// that we can read from them without segfault.
   /// \p MaskedOp is a list of instructions that have to be transformed into
   /// calls to the appropriate masked intrinsic when the loop is vectorized.
   /// \p ConditionalAssumes is a list of assume instructions in predicated
   /// blocks that must be dropped if the CFG gets flattened.
-  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
-                            SmallPtrSetImpl<const Instruction *> &MaskedOp,
-                            SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
-                            bool PreserveGuards = false) const;
+  bool blockCanBePredicated(
+      BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+      SmallPtrSetImpl<const Instruction *> &MaskedOp,
+      SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const;
 
   /// Updates the vectorization state by adding \p Phi to the inductions list.
   /// This can set \p Phi as the main induction of the loop if \p Phi is a
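For orientation, here is a minimal caller-side sketch of the new four-parameter form. This is hypothetical member code, not part of the patch; the set names mirror the documented parameters, and the real caller is prepareToFoldTailByMasking, whose updated call site appears in the last C++ hunk below.

// Hypothetical sketch of code inside LoopVectorizationLegality (not from the patch):
// collect the side effects of predicating every block that needs it.
SmallPtrSet<Value *, 8> SafePointers;             // addresses known safe to read unconditionally
SmallPtrSet<const Instruction *, 8> MaskedOp;     // loads/stores that must become masked intrinsics
SmallPtrSet<Instruction *, 8> ConditionalAssumes; // assumes to drop if the CFG is flattened
for (BasicBlock *BB : TheLoop->blocks())
  if (blockNeedsPredication(BB) &&
      !blockCanBePredicated(BB, SafePointers, MaskedOp, ConditionalAssumes))
    return false; // some instruction in BB cannot be predicated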
@@ -955,10 +955,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
 bool LoopVectorizationLegality::blockCanBePredicated(
     BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
     SmallPtrSetImpl<const Instruction *> &MaskedOp,
-    SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
-    bool PreserveGuards) const {
-  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-
+    SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
   for (Instruction &I : *BB) {
     // Check that we don't have a constant expression that can trap as operand.
     for (Value *Operand : I.operands()) {
@@ -986,11 +983,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(
       if (!LI)
         return false;
       if (!SafePtrs.count(LI->getPointerOperand())) {
-        // !llvm.mem.parallel_loop_access implies if-conversion safety.
-        // Otherwise, record that the load needs (real or emulated) masking
-        // and let the cost model decide.
-        if (!IsAnnotatedParallel || PreserveGuards)
-          MaskedOp.insert(LI);
+        MaskedOp.insert(LI);
         continue;
       }
     }
@@ -1306,8 +1299,7 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
-                              TmpConditionalAssumes,
-                              /* MaskAllLoads= */ true)) {
+                              TmpConditionalAssumes)) {
       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
       return false;
     }
@@ -1,42 +0,0 @@
-; RUN: opt -loop-vectorize -S < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: norecurse nounwind uwtable
-define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 {
-entry:
-  br label %for.body
-
-; CHECK-LABEL: @Test
-; CHECK: <4 x i32>
-
-for.body:                                         ; preds = %cond.end, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ]
-  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !1
-  %cmp1 = icmp eq i32 %0, 0
-  %arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx3, align 4, !llvm.access.group !1
-  br i1 %cmp1, label %cond.end, label %cond.false
-
-cond.false:                                       ; preds = %for.body
-  %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
-  %2 = load i32, i32* %arrayidx7, align 4, !llvm.access.group !1
-  %add = add nsw i32 %2, %1
-  br label %cond.end
-
-cond.end:                                         ; preds = %for.body, %cond.false
-  %cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ]
-  store i32 %cond, i32* %arrayidx3, align 4, !llvm.access.group !1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 16
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
-
-for.end:                                          ; preds = %cond.end
-  ret void
-}
-
-attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
-
-!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
-!1 = distinct !{}
@@ -51,7 +51,7 @@ for.inc:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8
 }
 
-; Case2: With pragma assume_safety only the store is masked.
+; Case2: With pragma assume_safety both, load and store are masked.
 ; void assume_safety(int * p, int * q1, int * q2, int guard) {
 ;   #pragma clang loop vectorize(assume_safety)
 ;   for(int ix=0; ix < 1021; ++ix) {
@@ -63,7 +63,7 @@ for.inc:
 
 ;CHECK-LABEL: @assume_safety
 ;CHECK: vector.body:
-;CHECK-NOT: @llvm.masked.load
+;CHECK: call <8 x i32> @llvm.masked.load
 ;CHECK: call void @llvm.masked.store
 
 ; Function Attrs: norecurse nounwind uwtable