[ARM] ParallelDSP: multiple reduction stmts in loop

This fixes an issue where we did not properly support multiple reduction statements in a loop, and therefore did not generate SMLADs for these cases. The alias analysis checks were done too early, making the analysis too conservative. Differential revision: https://reviews.llvm.org/D49125 llvm-svn: 336795
2025-01-31 20:51:52 +01:00 · 2018-07-11 12:36:25 +00:00 · 2018-07-11 12:36:25 +00:00 · 2dddfdaff5
commit 2dddfdaff5
parent cc7ac3cc07
2 changed files with 151 additions and 41 deletions
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@ -11,6 +11,7 @@
 /// Armv6 introduced instructions to perform 32-bit SIMD operations. The
 /// purpose of this pass is do some IR pattern matching to create ACLE
 /// DSP intrinsics, which map on these 32-bit SIMD operations.
+/// This pass runs only when unaligned accesses are supported/enabled.
 //
 //===----------------------------------------------------------------------===//

@ -64,7 +65,16 @@ namespace {
    MemInstList  VecLd;     // List of all load instructions of this Mul
    MemLocList   MemLocs;   // All memory locations read by this Mul

-    ParallelMAC(Instruction *I, ValueList &V) : Mul(I), VL(V) {};
+    // The MAC-chains we currently recognise are simple chains that accumulate
+    // their results with a reducing integer add statement, and consist of
+    // a chain of adds and muls, which have only sext and load instructions as
+    // operands. Thus, these chains don't write memory. We check that this is
+    // true when we collect the operands, and use this in alias analysis checks
+    // that different parallel MACs don't interfere with each other.
+    bool ReadOnly;
+
+    ParallelMAC(Instruction *I, ValueList &V, bool RdOnly)
+      : Mul(I), VL(V), ReadOnly(RdOnly) {};
  };

  struct Reduction {
@ -73,6 +83,8 @@ namespace {
    Instruction     *AccIntAdd;       // The accumulating integer add statement,
                                      // i.e, the reduction statement.

+    ParallelMACList MACCandidates;    // The MAC candidates associated with
+                                      // this reduction statement.
    Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
  };

@ -380,8 +392,10 @@ static ReductionList MatchReductions(Function &F, Loop *TheLoop,
  const BasicBlock *Latch = TheLoop->getLoopLatch();

  // We need a preheader as getIncomingValueForBlock assumes there is one.
-  if (!TheLoop->getLoopPreheader())
+  if (!TheLoop->getLoopPreheader()) {
+    LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
    return Reductions;
+  }

  for (PHINode &Phi : Header->phis()) {
    const auto *Ty = Phi.getType();
@ -412,7 +426,7 @@ static ReductionList MatchReductions(Function &F, Loop *TheLoop,
  return Reductions;
 }

-static void AddCandidateMAC(ParallelMACList &Candidates, const Instruction *Acc,
+static void AddMACCandidate(ParallelMACList &Candidates, const Instruction *Acc,
                            Value *MulOp0, Value *MulOp1, int MulOpNum) {
  Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
@ -420,7 +434,15 @@ static void AddCandidateMAC(ParallelMACList &Candidates, const Instruction *Acc,
  if (IsNarrowSequence<16>(MulOp0, VL) &&
      IsNarrowSequence<16>(MulOp1, VL)) {
    LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
-    Candidates.push_back(ParallelMAC(Mul, VL));
+
+    bool MayWriteMem = false;
+    for (auto &V : VL) {
+      if (dyn_cast<Instruction>(V)->mayWriteToMemory()) {
+        MayWriteMem = true;
+        break;
+      }
+    }
+    Candidates.push_back(ParallelMAC(Mul, VL, !MayWriteMem));
  }
 }

@ -433,20 +455,20 @@ static ParallelMACList MatchParallelMACs(Reduction &R) {
  // Pattern 1: the accumulator is the RHS of the mul.
  while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
                         m_Value(A)))){
-    AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);
+    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
    Acc = dyn_cast<Instruction>(A);
  }
  // Pattern 2: the accumulator is the LHS of the mul.
  while(match(Acc, m_Add(m_Value(A),
                         m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
-    AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 1);
+    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1);
    Acc = dyn_cast<Instruction>(A);
  }

  // The last mul in the chain has a slightly different pattern:
  // the mul is the first operand
  if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
-    AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);
+    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);

  // Because we start at the bottom of the chain, and we work our way up,
  // the muls are added in reverse program order to the list.
@ -456,35 +478,35 @@ static ParallelMACList MatchParallelMACs(Reduction &R) {

 // Collects all instructions that are not part of the MAC chains, which is the
 // set of instructions that can potentially alias with the MAC operands.
-static Instructions AliasCandidates(BasicBlock *Header,
-                                    ParallelMACList &MACCandidates) {
-  Instructions Aliases;
-  auto IsMACCandidate = [] (Instruction *I, ParallelMACList &MACCandidates) {
-    for (auto &MAC : MACCandidates)
-      for (auto *Val : MAC.VL)
-        if (I == MAC.Mul || Val == I)
-          return true;
-   return false;
-  };
-
-  std::for_each(Header->begin(), Header->end(),
-                [&Aliases, &MACCandidates, &IsMACCandidate] (Instruction &I) {
-                  if (I.mayReadOrWriteMemory() &&
-                      !IsMACCandidate(&I, MACCandidates))
-                    Aliases.push_back(&I); });
-  return Aliases;
+static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
+                            Instructions &Writes) {
+  for (auto &I : *Header) {
+    if (I.mayReadFromMemory())
+      Reads.push_back(&I);
+    if (I.mayWriteToMemory())
+      Writes.push_back(&I);
+  }
 }

-// This compares all instructions from the "alias candidates" set, i.e., all
-// instructions that are not part of the MAC-chain, with all instructions in
-// the MAC candidate set, to see if instructions are aliased.
-static bool AreAliased(AliasAnalysis *AA, Instructions AliasCandidates,
-                       ParallelMACList &MACCandidates) {
+// Check whether statements in the basic block that write to memory alias with
+// the memory locations accessed by the MAC-chains.
+// TODO: we need the read statements when we accept more complicated chains.
+static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
+                       Instructions &Writes, ParallelMACList &MACCandidates) {
  LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto *I : AliasCandidates) {
-    LLVM_DEBUG(dbgs() << "- "; I->dump());
-    for (auto &MAC : MACCandidates) {
-      LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
+  for (auto &MAC : MACCandidates) {
+    LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
+
+    // At the moment, we only allow simple chains that consist solely of reads
+    // and accumulate their result with an integer add. Such chains don't write
+    // memory, so we simply bail if this one does.
+    if (!MAC.ReadOnly)
+      return true;
+
+    // Now for all writes in the basic block, check that they don't alias with
+    // the memory locations accessed by our MAC-chain:
+    for (auto *I : Writes) {
+      LLVM_DEBUG(dbgs() << "- "; I->dump());
      assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
      for (auto &MemLoc : MAC.MemLocs) {
        if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
@ -495,6 +517,7 @@ static bool AreAliased(AliasAnalysis *AA, Instructions AliasCandidates,
      }
    }
  }
+
  LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
  return false;
 }
@ -554,8 +577,6 @@ static bool SetMemoryLocations(ParallelMACList &Candidates) {
 // If loop invariants are used instead of loads, these need to be packed
 // before the loop begins.
 //
-// Can only be enabled for cores which support unaligned accesses.
-//
 bool ARMParallelDSP::MatchSMLAD(Function &F) {
  BasicBlock *Header = L->getHeader();
  LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
@ -569,11 +590,25 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
    ParallelMACList MACCandidates = MatchParallelMACs(R);
    if (!SetMemoryLocations(MACCandidates))
      continue;
-    Instructions Aliases = AliasCandidates(Header, MACCandidates);
-    if (AreAliased(AA, Aliases, MACCandidates))
-      continue;
-    PMACPairList PMACPairs = CreateParallelMACPairs(MACCandidates);
-    Changed = InsertParallelMACs(R, PMACPairs) || Changed;
+    R.MACCandidates = MACCandidates;
+
+    LLVM_DEBUG(dbgs() << "MAC candidates:\n";
+      for (auto &M : R.MACCandidates)
+        M.Mul->dump();
+      dbgs() << "\n";);
+  }
+
+  // Collect all instructions that may read or write memory. Our alias
+  // analysis checks bail out if any of these instructions aliases with an
+  // instruction from the MAC-chain.
+  Instructions Reads, Writes;
+  AliasCandidates(Header, Reads, Writes);
+
+  for (auto &R : Reductions) {
+    if (AreAliased(AA, Reads, Writes, R.MACCandidates))
+      return false;
+    PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
+    Changed |= InsertParallelMACs(R, PMACPairs);
  }

  LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
--- a/test/CodeGen/ARM/smlad0.ll
+++ b/test/CodeGen/ARM/smlad0.ll
@ -5,17 +5,20 @@
 ;
 ; Check DSP extension:
 ; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 -mattr=-dsp < %s -arm-parallel-dsp -S | FileCheck %s --check-prefix=CHECK-UNSUPPORTED
+
+define dso_local i32 @OneReduction(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 ;
+; CHECK-LABEL: @OneReduction
 ; CHECK:  %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
 ; CHECK:  [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
 ; CHECK:  [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK:  [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK:  [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
 ; CHECK:  [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026)
+; CHECK-NOT: call i32 @llvm.arm.smlad
 ;
 ; CHECK-UNSUPPORTED-NOT:  call i32 @llvm.arm.smlad
 ;
-define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 entry:
  %cmp24 = icmp sgt i32 %arg, 0
  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
@ -30,7 +33,9 @@ for.cond.cleanup:
  ret i32 %mac1.0.lcssa

 for.body:
+; One reduction statement here:
  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+
  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
  %0 = load i16, i16* %arrayidx, align 2
@ -55,3 +60,73 @@ for.body:
  %exitcond = icmp ne i32 %add, %arg
  br i1 %exitcond, label %for.body, label %for.cond.cleanup
 }
+
+define dso_local arm_aapcs_vfpcc i32 @TwoReductions(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+;
+; CHECK-LABEL: @TwoReductions
+;
+; CHECK:  %mac1{{\.}}058 = phi i32 [ [[V10:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
+; CHECK:  %mac2{{\.}}057 = phi i32 [ [[V17:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
+; CHECK:  [[V10]] = call i32 @llvm.arm.smlad(i32 %{{.*}}, i32 %{{.*}}, i32 %mac1{{\.}}058)
+; CHECK:  [[V17]] = call i32 @llvm.arm.smlad(i32 %{{.*}}, i32 %{{.*}}, i32 %mac2{{\.}}057)
+; CHECK-NOT: call i32 @llvm.arm.smlad
+;
+entry:
+  %cmp55 = icmp sgt i32 %arg, 0
+  br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:
+  %mac2.0.lcssa = phi i32 [ 0, %entry ], [ %add28, %for.body ]
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.body ]
+  %add30 = add nsw i32 %mac1.0.lcssa, %mac2.0.lcssa
+  ret i32 %add30
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+; And two reduction statements here:
+  %mac1.058 = phi i32 [ %add16, %for.body ], [ 0, %for.body.preheader ]
+  %mac2.057 = phi i32 [ %add28, %for.body ], [ 0, %for.body.preheader ]
+
+  %i.056 = phi i32 [ %add29, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.056
+  %0 = load i16, i16* %arrayidx, align 2
+  %add1 = or i32 %i.056, 1
+  %arrayidx2 = getelementptr inbounds i16, i16* %arg3, i32 %add1
+  %1 = load i16, i16* %arrayidx2, align 2
+  %add3 = or i32 %i.056, 2
+  %arrayidx4 = getelementptr inbounds i16, i16* %arg3, i32 %add3
+  %2 = load i16, i16* %arrayidx4, align 2
+
+  %add5 = or i32 %i.056, 3
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg3, i32 %add5
+  %3 = load i16, i16* %arrayidx6, align 2
+  %arrayidx8 = getelementptr inbounds i16, i16* %arg2, i32 %i.056
+  %4 = load i16, i16* %arrayidx8, align 2
+  %conv = sext i16 %4 to i32
+  %conv9 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv9
+  %arrayidx11 = getelementptr inbounds i16, i16* %arg2, i32 %add1
+  %5 = load i16, i16* %arrayidx11, align 2
+  %conv12 = sext i16 %5 to i32
+  %conv13 = sext i16 %1 to i32
+  %mul14 = mul nsw i32 %conv12, %conv13
+  %add15 = add i32 %mul, %mac1.058
+  %add16 = add i32 %add15, %mul14
+  %arrayidx18 = getelementptr inbounds i16, i16* %arg2, i32 %add3
+  %6 = load i16, i16* %arrayidx18, align 2
+  %conv19 = sext i16 %6 to i32
+  %conv20 = sext i16 %2 to i32
+  %mul21 = mul nsw i32 %conv19, %conv20
+  %arrayidx23 = getelementptr inbounds i16, i16* %arg2, i32 %add5
+  %7 = load i16, i16* %arrayidx23, align 2
+  %conv24 = sext i16 %7 to i32
+  %conv25 = sext i16 %3 to i32
+  %mul26 = mul nsw i32 %conv24, %conv25
+  %add27 = add i32 %mul21, %mac2.057
+  %add28 = add i32 %add27, %mul26
+  %add29 = add nuw nsw i32 %i.056, 4
+  %cmp = icmp slt i32 %add29, %arg
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}