Teach LSR how to cope better with unrolled loops on targets where the
addressing modes don't make this trivially easy. This allows it to avoid
falling into the less precise heuristics in more cases.

llvm-svn: 104186
2024-11-24 11:42:57 +01:00 · 2010-05-19 23:43:12 +00:00 · 2010-05-19 23:43:12 +00:00 · 772b731ca5
commit 772b731ca5
parent 2dbe0d9886
2 changed files with 577 additions and 3 deletions
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@ -113,6 +113,7 @@ class RegUseTracker {
 public:
  void CountRegister(const SCEV *Reg, size_t LUIdx);
  void DropRegister(const SCEV *Reg, size_t LUIdx);
+  void DropUse(size_t LUIdx);

  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;

@ -150,6 +151,14 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
  RSD.UsedByIndices.reset(LUIdx);
 }

+void
+RegUseTracker::DropUse(size_t LUIdx) {
+  // Clear LUIdx from the UsedByIndices of every register in RegUsesMap.
+  for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
+       I != E; ++I)
+    I->second.UsedByIndices.reset(LUIdx);
+}
+
 bool
 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
  if (!RegUsesMap.count(Reg)) return false;
@ -951,6 +960,7 @@ public:
                                      MaxOffset(INT64_MIN),
                                      AllFixupsOutsideLoop(true) {}

+  bool HasFormulaWithSameRegs(const Formula &F) const;
  bool InsertFormula(const Formula &F);
  void DeleteFormula(Formula &F);
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@ -961,6 +971,16 @@ public:
  void dump() const;
 };

+/// HasFormulaWithSameRegs - Test whether this use has a formula which has the
+/// same registers as the given formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+  SmallVector<const SCEV *, 2> Key = F.BaseRegs;
+  if (F.ScaledReg) Key.push_back(F.ScaledReg);
+  // Unstable sort by host order ok, because this is only used for uniquifying.
+  std::sort(Key.begin(), Key.end());
+  return Uniquifier.count(Key);
+}
+
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
 bool LSRUse::InsertFormula(const Formula &F) {
@ -995,6 +1015,7 @@ bool LSRUse::InsertFormula(const Formula &F) {
 void LSRUse::DeleteFormula(Formula &F) {
  std::swap(F, Formulae.back());
  Formulae.pop_back();
+  assert(!Formulae.empty() && "LSRUse has no formulae left!");
 }

 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
@ -1134,6 +1155,13 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

+  // Canonicalize a scale of 1 to a base register if the formula doesn't
+  // already have a base register.
+  if (!AM.HasBaseReg && AM.Scale == 1) {
+    AM.Scale = 0;
+    AM.HasBaseReg = true;
+  }
+
  return isLegalUse(AM, Kind, AccessTy, TLI);
 }

@ -1244,12 +1272,15 @@ class LSRInstance {
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+                          bool HasBaseReg,
                          LSRUse::KindType Kind, const Type *AccessTy);

  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
                                    LSRUse::KindType Kind,
                                    const Type *AccessTy);

+  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
 public:
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
@ -1742,6 +1773,7 @@ LSRInstance::OptimizeLoopTermCond() {

 bool
 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+                                bool HasBaseReg,
                                LSRUse::KindType Kind, const Type *AccessTy) {
  int64_t NewMinOffset = LU.MinOffset;
  int64_t NewMaxOffset = LU.MaxOffset;
@ -1754,12 +1786,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
    return false;
  // Conservatively assume HasBaseReg is true for now.
  if (NewOffset < LU.MinOffset) {
-    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true,
+    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
                          Kind, AccessTy, TLI))
      return false;
    NewMinOffset = NewOffset;
  } else if (NewOffset > LU.MaxOffset) {
-    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true,
+    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
                          Kind, AccessTy, TLI))
      return false;
    NewMaxOffset = NewOffset;
@ -1798,7 +1830,7 @@ LSRInstance::getUse(const SCEV *&Expr,
    // A use already existed with this base.
    size_t LUIdx = P.first->second;
    LSRUse &LU = Uses[LUIdx];
-    if (reconcileNewOffset(LU, Offset, Kind, AccessTy))
+    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
      // Reuse this use.
      return std::make_pair(LUIdx, Offset);
  }
@ -1819,6 +1851,40 @@ LSRInstance::getUse(const SCEV *&Expr,
  return std::make_pair(LUIdx, Offset);
 }

+/// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has
+/// a formula that has the same registers as the given formula.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+                                       const LSRUse &OrigLU) {
+  // Search all uses for the formula. This could be more clever. Ignore
+  // ICmpZero uses because they may contain formulae generated by
+  // GenerateICmpZeroScales, in which case adding fixup offsets may
+  // be invalid.
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    if (&LU != &OrigLU &&
+        LU.Kind != LSRUse::ICmpZero &&
+        LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+        LU.HasFormulaWithSameRegs(OrigF)) {
+      for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+           FIdx != NumForms; ++FIdx) {
+        Formula &F = LU.Formulae[FIdx];
+        if (F.BaseRegs == OrigF.BaseRegs &&
+            F.ScaledReg == OrigF.ScaledReg &&
+            F.AM.BaseGV == OrigF.AM.BaseGV &&
+            F.AM.Scale == OrigF.AM.Scale &&
+            LU.Kind) { // NOTE(review): bare Kind truthiness test; confirm.
+          if (F.AM.BaseOffs == 0)
+            return &LU;
+          break;
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
 void LSRInstance::CollectInterestingTypesAndFactors() {
  SmallSetVector<const SCEV *, 4> Strides;

@ -2722,6 +2788,128 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
 /// of formulae. This keeps the main solver from taking an extraordinary amount
 /// of time in some worst-case scenarios.
 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+                    "which use a superset of registers used by other "
+                    "formulae.\n");
+
+    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+      LSRUse &LU = Uses[LUIdx];
+      bool Any = false;
+      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+        Formula &F = LU.Formulae[i];
+        for (SmallVectorImpl<const SCEV *>::const_iterator
+             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+            Formula NewF = F;
+            NewF.AM.BaseOffs += C->getValue()->getSExtValue();
+            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                                (I - F.BaseRegs.begin()));
+            if (LU.HasFormulaWithSameRegs(NewF)) {
+              DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
+              LU.DeleteFormula(F);
+              --i;
+              --e;
+              Any = true;
+              break;
+            }
+          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+              if (!F.AM.BaseGV) {
+                Formula NewF = F;
+                NewF.AM.BaseGV = GV;
+                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                                    (I - F.BaseRegs.begin()));
+                if (LU.HasFormulaWithSameRegs(NewF)) {
+                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
+                        dbgs() << '\n');
+                  LU.DeleteFormula(F);
+                  --i;
+                  --e;
+                  Any = true;
+                  break;
+                }
+              }
+          }
+        }
+      }
+      if (Any)
+        LU.RecomputeRegs(LUIdx, RegUses);
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
+                    "separated by a constant offset will use the same "
+                    "registers.\n");
+
+    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+      LSRUse &LU = Uses[LUIdx];
+      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+        Formula &F = LU.Formulae[i];
+        if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
+          if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
+            if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
+                                   /*HasBaseReg=*/false,
+                                   LU.Kind, LU.AccessTy)) {
+              DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
+                    dbgs() << '\n');
+
+              LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+              // Delete formulae from the new use which are no longer legal.
+              bool Any = false;
+              for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+                Formula &F = LUThatHas->Formulae[i];
+                if (!isLegalUse(F.AM,
+                                LUThatHas->MinOffset, LUThatHas->MaxOffset,
+                                LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
+                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
+                        dbgs() << '\n');
+                  LUThatHas->DeleteFormula(F);
+                  --i;
+                  --e;
+                  Any = true;
+                }
+              }
+              if (Any)
+                LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+              // Update the relocs to reference the new use.
+              for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
+                if (Fixups[i].LUIdx == LUIdx) {
+                  Fixups[i].LUIdx = LUThatHas - &Uses.front();
+                  Fixups[i].Offset += F.AM.BaseOffs;
+                  DEBUG(errs() << "New fixup has offset "
+                               << Fixups[i].Offset << "\n");
+                }
+                if (Fixups[i].LUIdx == NumUses-1)
+                  Fixups[i].LUIdx = LUIdx;
+              }
+
+              // Delete the old use.
+              std::swap(LU, Uses.back());
+              Uses.pop_back();
+              --LUIdx;
+              --NumUses;
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+
  SmallPtrSet<const SCEV *, 4> Taken;
  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    // Ok, we have too many of formulae on our hands to conveniently handle.
--- a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
+++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
@ -0,0 +1,386 @@
+; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
+
+; LSR should recognize that this is an unrolled loop which can use
+; constant offset addressing, so that each of the following stores
+; uses the same register.
+
+; CHECK: vstr.32 s0, [r12, #-128]
+; CHECK: vstr.32 s0, [r12, #-96]
+; CHECK: vstr.32 s0, [r12, #-64]
+; CHECK: vstr.32 s0, [r12, #-32]
+; CHECK: vstr.32 s0, [r12]
+; CHECK: vstr.32 s0, [r12, #32]
+; CHECK: vstr.32 s0, [r12, #64]
+; CHECK: vstr.32 s0, [r12, #96]
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
+
+%0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
+%1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 }
+%2 = type { %1*, %3*, %6*, i8*, i32, i32 }
+%3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 }
+%4 = type opaque
+%5 = type opaque
+%6 = type { void (%2*)*, i32, i32, i32, i32 }
+%7 = type { [8 x i32], [12 x i32] }
+%8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* }
+%9 = type { [64 x i16], i32 }
+%10 = type { [17 x i8], [256 x i8], i32 }
+%11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* }
+%12 = type { %12*, i8, i32, i32, i8* }
+%13 = type { void (%0*)*, void (%0*)*, i32 }
+%14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* }
+%15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** }
+%16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* }
+%17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 }
+%18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 }
+%19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 }
+%20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] }
+%21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 }
+%22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* }
+%23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* }
+
+define arm_apcscc void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind {
+bb:
+  %t = alloca [64 x float], align 4           
+  %t5 = getelementptr inbounds %0* %a0, i32 0, i32 65
+  %t6 = load i8** %t5, align 4              
+  %t7 = getelementptr inbounds %11* %a1, i32 0, i32 20
+  %t8 = load i8** %t7, align 4              
+  br label %bb9
+
+bb9:                                            
+  %t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ]
+  %t11 = add i32 %t10, 8                    
+  %t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11
+  %t13 = add i32 %t10, 16                   
+  %t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13
+  %t15 = add i32 %t10, 24                   
+  %t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15
+  %t17 = add i32 %t10, 32                   
+  %t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17
+  %t19 = add i32 %t10, 40                   
+  %t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19
+  %t21 = add i32 %t10, 48                   
+  %t22 = getelementptr [64 x float]* %t, i32 0, i32 %t21
+  %t23 = add i32 %t10, 56                   
+  %t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23
+  %t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10
+  %t26 = shl i32 %t10, 5                    
+  %t27 = or i32 %t26, 8                     
+  %t28 = getelementptr i8* %t8, i32 %t27  
+  %t29 = bitcast i8* %t28 to float*         
+  %t30 = or i32 %t26, 16                    
+  %t31 = getelementptr i8* %t8, i32 %t30  
+  %t32 = bitcast i8* %t31 to float*         
+  %t33 = or i32 %t26, 24                    
+  %t34 = getelementptr i8* %t8, i32 %t33  
+  %t35 = bitcast i8* %t34 to float*         
+  %t36 = or i32 %t26, 4                     
+  %t37 = getelementptr i8* %t8, i32 %t36  
+  %t38 = bitcast i8* %t37 to float*         
+  %t39 = or i32 %t26, 12                    
+  %t40 = getelementptr i8* %t8, i32 %t39  
+  %t41 = bitcast i8* %t40 to float*         
+  %t42 = or i32 %t26, 20                    
+  %t43 = getelementptr i8* %t8, i32 %t42  
+  %t44 = bitcast i8* %t43 to float*         
+  %t45 = or i32 %t26, 28                    
+  %t46 = getelementptr i8* %t8, i32 %t45  
+  %t47 = bitcast i8* %t46 to float*         
+  %t48 = getelementptr i8* %t8, i32 %t26  
+  %t49 = bitcast i8* %t48 to float*         
+  %t50 = shl i32 %t10, 3                    
+  %t51 = or i32 %t50, 1                     
+  %t52 = getelementptr i16* %a2, i32 %t51 
+  %t53 = or i32 %t50, 2                     
+  %t54 = getelementptr i16* %a2, i32 %t53 
+  %t55 = or i32 %t50, 3                     
+  %t56 = getelementptr i16* %a2, i32 %t55 
+  %t57 = or i32 %t50, 4                     
+  %t58 = getelementptr i16* %a2, i32 %t57 
+  %t59 = or i32 %t50, 5                     
+  %t60 = getelementptr i16* %a2, i32 %t59 
+  %t61 = or i32 %t50, 6                     
+  %t62 = getelementptr i16* %a2, i32 %t61 
+  %t63 = or i32 %t50, 7                     
+  %t64 = getelementptr i16* %a2, i32 %t63 
+  %t65 = getelementptr i16* %a2, i32 %t50 
+  %t66 = load i16* %t52, align 2            
+  %t67 = icmp eq i16 %t66, 0                
+  %t68 = load i16* %t54, align 2            
+  %t69 = icmp eq i16 %t68, 0                
+  %t70 = and i1 %t67, %t69                
+  br i1 %t70, label %bb71, label %bb91
+
+bb71:                                           
+  %t72 = load i16* %t56, align 2            
+  %t73 = icmp eq i16 %t72, 0                
+  br i1 %t73, label %bb74, label %bb91
+
+bb74:                                           
+  %t75 = load i16* %t58, align 2            
+  %t76 = icmp eq i16 %t75, 0                
+  br i1 %t76, label %bb77, label %bb91
+
+bb77:                                           
+  %t78 = load i16* %t60, align 2            
+  %t79 = icmp eq i16 %t78, 0                
+  br i1 %t79, label %bb80, label %bb91
+
+bb80:                                           
+  %t81 = load i16* %t62, align 2            
+  %t82 = icmp eq i16 %t81, 0                
+  br i1 %t82, label %bb83, label %bb91
+
+bb83:                                           
+  %t84 = load i16* %t64, align 2            
+  %t85 = icmp eq i16 %t84, 0                
+  br i1 %t85, label %bb86, label %bb91
+
+bb86:                                           
+  %t87 = load i16* %t65, align 2            
+  %t88 = sitofp i16 %t87 to float           
+  %t89 = load float* %t49, align 4          
+  %t90 = fmul float %t88, %t89            
+  store float %t90, float* %t25, align 4
+  store float %t90, float* %t12, align 4
+  store float %t90, float* %t14, align 4
+  store float %t90, float* %t16, align 4
+  store float %t90, float* %t18, align 4
+  store float %t90, float* %t20, align 4
+  store float %t90, float* %t22, align 4
+  store float %t90, float* %t24, align 4
+  br label %bb156
+
+bb91:                                           
+  %t92 = load i16* %t65, align 2            
+  %t93 = sitofp i16 %t92 to float           
+  %t94 = load float* %t49, align 4          
+  %t95 = fmul float %t93, %t94            
+  %t96 = sitofp i16 %t68 to float           
+  %t97 = load float* %t29, align 4          
+  %t98 = fmul float %t96, %t97            
+  %t99 = load i16* %t58, align 2            
+  %t100 = sitofp i16 %t99 to float          
+  %t101 = load float* %t32, align 4         
+  %t102 = fmul float %t100, %t101         
+  %t103 = load i16* %t62, align 2           
+  %t104 = sitofp i16 %t103 to float         
+  %t105 = load float* %t35, align 4         
+  %t106 = fmul float %t104, %t105         
+  %t107 = fadd float %t95, %t102          
+  %t108 = fsub float %t95, %t102          
+  %t109 = fadd float %t98, %t106          
+  %t110 = fsub float %t98, %t106          
+  %t111 = fmul float %t110, 0x3FF6A09E60000000
+  %t112 = fsub float %t111, %t109         
+  %t113 = fadd float %t107, %t109         
+  %t114 = fsub float %t107, %t109         
+  %t115 = fadd float %t108, %t112         
+  %t116 = fsub float %t108, %t112         
+  %t117 = sitofp i16 %t66 to float          
+  %t118 = load float* %t38, align 4         
+  %t119 = fmul float %t117, %t118         
+  %t120 = load i16* %t56, align 2           
+  %t121 = sitofp i16 %t120 to float         
+  %t122 = load float* %t41, align 4         
+  %t123 = fmul float %t121, %t122         
+  %t124 = load i16* %t60, align 2           
+  %t125 = sitofp i16 %t124 to float         
+  %t126 = load float* %t44, align 4         
+  %t127 = fmul float %t125, %t126         
+  %t128 = load i16* %t64, align 2           
+  %t129 = sitofp i16 %t128 to float         
+  %t130 = load float* %t47, align 4         
+  %t131 = fmul float %t129, %t130         
+  %t132 = fadd float %t127, %t123         
+  %t133 = fsub float %t127, %t123         
+  %t134 = fadd float %t119, %t131         
+  %t135 = fsub float %t119, %t131         
+  %t136 = fadd float %t134, %t132         
+  %t137 = fsub float %t134, %t132         
+  %t138 = fmul float %t137, 0x3FF6A09E60000000
+  %t139 = fadd float %t133, %t135         
+  %t140 = fmul float %t139, 0x3FFD906BC0000000
+  %t141 = fmul float %t135, 0x3FF1517A80000000
+  %t142 = fsub float %t141, %t140         
+  %t143 = fmul float %t133, 0xC004E7AEA0000000
+  %t144 = fadd float %t143, %t140         
+  %t145 = fsub float %t144, %t136         
+  %t146 = fsub float %t138, %t145         
+  %t147 = fadd float %t142, %t146         
+  %t148 = fadd float %t113, %t136         
+  store float %t148, float* %t25, align 4
+  %t149 = fsub float %t113, %t136         
+  store float %t149, float* %t24, align 4
+  %t150 = fadd float %t115, %t145         
+  store float %t150, float* %t12, align 4
+  %t151 = fsub float %t115, %t145         
+  store float %t151, float* %t22, align 4
+  %t152 = fadd float %t116, %t146         
+  store float %t152, float* %t14, align 4
+  %t153 = fsub float %t116, %t146         
+  store float %t153, float* %t20, align 4
+  %t154 = fadd float %t114, %t147         
+  store float %t154, float* %t18, align 4
+  %t155 = fsub float %t114, %t147         
+  store float %t155, float* %t16, align 4
+  br label %bb156
+
+bb156:                                          
+  %t157 = add i32 %t10, 1                   
+  %t158 = icmp eq i32 %t157, 8              
+  br i1 %t158, label %bb159, label %bb9
+
+bb159:                                          
+  %t160 = add i32 %a4, 7                    
+  %t161 = add i32 %a4, 1                    
+  %t162 = add i32 %a4, 6                    
+  %t163 = add i32 %a4, 2                    
+  %t164 = add i32 %a4, 5                    
+  %t165 = add i32 %a4, 4                    
+  %t166 = add i32 %a4, 3                    
+  br label %bb167
+
+bb167:                                          
+  %t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ]
+  %t169 = getelementptr i8** %a3, i32 %t168
+  %t170 = shl i32 %t168, 3                  
+  %t171 = or i32 %t170, 4                   
+  %t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171
+  %t173 = or i32 %t170, 2                   
+  %t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173
+  %t175 = or i32 %t170, 6                   
+  %t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175
+  %t177 = or i32 %t170, 5                   
+  %t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177
+  %t179 = or i32 %t170, 3                   
+  %t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179
+  %t181 = or i32 %t170, 1                   
+  %t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181
+  %t183 = or i32 %t170, 7                   
+  %t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183
+  %t185 = getelementptr [64 x float]* %t, i32 0, i32 %t170
+  %t186 = load i8** %t169, align 4          
+  %t187 = getelementptr inbounds i8* %t186, i32 %a4
+  %t188 = load float* %t185, align 4        
+  %t189 = load float* %t172, align 4        
+  %t190 = fadd float %t188, %t189         
+  %t191 = fsub float %t188, %t189         
+  %t192 = load float* %t174, align 4        
+  %t193 = load float* %t176, align 4        
+  %t194 = fadd float %t192, %t193         
+  %t195 = fsub float %t192, %t193         
+  %t196 = fmul float %t195, 0x3FF6A09E60000000
+  %t197 = fsub float %t196, %t194         
+  %t198 = fadd float %t190, %t194         
+  %t199 = fsub float %t190, %t194         
+  %t200 = fadd float %t191, %t197         
+  %t201 = fsub float %t191, %t197         
+  %t202 = load float* %t178, align 4        
+  %t203 = load float* %t180, align 4        
+  %t204 = fadd float %t202, %t203         
+  %t205 = fsub float %t202, %t203         
+  %t206 = load float* %t182, align 4        
+  %t207 = load float* %t184, align 4        
+  %t208 = fadd float %t206, %t207         
+  %t209 = fsub float %t206, %t207         
+  %t210 = fadd float %t208, %t204         
+  %t211 = fsub float %t208, %t204         
+  %t212 = fmul float %t211, 0x3FF6A09E60000000
+  %t213 = fadd float %t205, %t209         
+  %t214 = fmul float %t213, 0x3FFD906BC0000000
+  %t215 = fmul float %t209, 0x3FF1517A80000000
+  %t216 = fsub float %t215, %t214         
+  %t217 = fmul float %t205, 0xC004E7AEA0000000
+  %t218 = fadd float %t217, %t214         
+  %t219 = fsub float %t218, %t210         
+  %t220 = fsub float %t212, %t219         
+  %t221 = fadd float %t216, %t220         
+  %t222 = fadd float %t198, %t210         
+  %t223 = fptosi float %t222 to i32         
+  %t224 = add nsw i32 %t223, 4              
+  %t225 = lshr i32 %t224, 3                 
+  %t226 = and i32 %t225, 1023               
+  %t227 = add i32 %t226, 128                
+  %t228 = getelementptr inbounds i8* %t6, i32 %t227
+  %t229 = load i8* %t228, align 1           
+  store i8 %t229, i8* %t187, align 1
+  %t230 = fsub float %t198, %t210         
+  %t231 = fptosi float %t230 to i32         
+  %t232 = add nsw i32 %t231, 4              
+  %t233 = lshr i32 %t232, 3                 
+  %t234 = and i32 %t233, 1023               
+  %t235 = add i32 %t234, 128                
+  %t236 = getelementptr inbounds i8* %t6, i32 %t235
+  %t237 = load i8* %t236, align 1           
+  %t238 = getelementptr inbounds i8* %t186, i32 %t160
+  store i8 %t237, i8* %t238, align 1
+  %t239 = fadd float %t200, %t219         
+  %t240 = fptosi float %t239 to i32         
+  %t241 = add nsw i32 %t240, 4              
+  %t242 = lshr i32 %t241, 3                 
+  %t243 = and i32 %t242, 1023               
+  %t244 = add i32 %t243, 128                
+  %t245 = getelementptr inbounds i8* %t6, i32 %t244
+  %t246 = load i8* %t245, align 1           
+  %t247 = getelementptr inbounds i8* %t186, i32 %t161
+  store i8 %t246, i8* %t247, align 1
+  %t248 = fsub float %t200, %t219         
+  %t249 = fptosi float %t248 to i32         
+  %t250 = add nsw i32 %t249, 4              
+  %t251 = lshr i32 %t250, 3                 
+  %t252 = and i32 %t251, 1023               
+  %t253 = add i32 %t252, 128                
+  %t254 = getelementptr inbounds i8* %t6, i32 %t253
+  %t255 = load i8* %t254, align 1           
+  %t256 = getelementptr inbounds i8* %t186, i32 %t162
+  store i8 %t255, i8* %t256, align 1
+  %t257 = fadd float %t201, %t220         
+  %t258 = fptosi float %t257 to i32         
+  %t259 = add nsw i32 %t258, 4              
+  %t260 = lshr i32 %t259, 3                 
+  %t261 = and i32 %t260, 1023               
+  %t262 = add i32 %t261, 128                
+  %t263 = getelementptr inbounds i8* %t6, i32 %t262
+  %t264 = load i8* %t263, align 1           
+  %t265 = getelementptr inbounds i8* %t186, i32 %t163
+  store i8 %t264, i8* %t265, align 1
+  %t266 = fsub float %t201, %t220         
+  %t267 = fptosi float %t266 to i32         
+  %t268 = add nsw i32 %t267, 4              
+  %t269 = lshr i32 %t268, 3                 
+  %t270 = and i32 %t269, 1023               
+  %t271 = add i32 %t270, 128                
+  %t272 = getelementptr inbounds i8* %t6, i32 %t271
+  %t273 = load i8* %t272, align 1           
+  %t274 = getelementptr inbounds i8* %t186, i32 %t164
+  store i8 %t273, i8* %t274, align 1
+  %t275 = fadd float %t199, %t221         
+  %t276 = fptosi float %t275 to i32         
+  %t277 = add nsw i32 %t276, 4              
+  %t278 = lshr i32 %t277, 3                 
+  %t279 = and i32 %t278, 1023               
+  %t280 = add i32 %t279, 128                
+  %t281 = getelementptr inbounds i8* %t6, i32 %t280
+  %t282 = load i8* %t281, align 1           
+  %t283 = getelementptr inbounds i8* %t186, i32 %t165
+  store i8 %t282, i8* %t283, align 1
+  %t284 = fsub float %t199, %t221         
+  %t285 = fptosi float %t284 to i32         
+  %t286 = add nsw i32 %t285, 4              
+  %t287 = lshr i32 %t286, 3                 
+  %t288 = and i32 %t287, 1023               
+  %t289 = add i32 %t288, 128                
+  %t290 = getelementptr inbounds i8* %t6, i32 %t289
+  %t291 = load i8* %t290, align 1           
+  %t292 = getelementptr inbounds i8* %t186, i32 %t166
+  store i8 %t291, i8* %t292, align 1
+  %t293 = add nsw i32 %t168, 1              
+  %t294 = icmp eq i32 %t293, 8              
+  br i1 %t294, label %bb295, label %bb167
+
+bb295:                                          
+  ret void
+}