1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

AMDGPU: Partial ILP scheduler port from SelectionDAG to SchedulingDAG (experimental)

Differential revision: https://reviews.llvm.org/D39897

llvm-svn: 318649
This commit is contained in:
Valery Pykhtin 2017-11-20 14:35:53 +00:00
parent b76bf11f90
commit 5d88936670
6 changed files with 1020 additions and 4 deletions

View File

@ -219,6 +219,16 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}
/// Builds a GCNIterativeScheduler that runs the SCHEDULE_ILP strategy and has
/// the load-cluster, store-cluster and AMDGPU macro-fusion DAG mutations
/// attached. The caller receives the newly allocated scheduler.
static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  GCNIterativeScheduler *Sched =
      new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);

  // Memory-operation clustering first, then instruction fusion.
  Sched->addMutation(createLoadClusterDAGMutation(Sched->TII, Sched->TRI));
  Sched->addMutation(createStoreClusterDAGMutation(Sched->TII, Sched->TRI));
  Sched->addMutation(createAMDGPUMacroFusionDAGMutation());

  return Sched;
}
// Registers createR600MachineScheduler as a machine scheduler factory under
// the name "r600".
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@ -242,6 +252,11 @@ GCNMinRegSchedRegistry("gcn-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);
// Registers createIterativeILPMachineScheduler as a machine scheduler factory
// under the name "gcn-ilp" (the name passed to llc via -misched=gcn-ilp).
static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
"Run GCN iterative scheduler for ILP scheduling (experimental)",
createIterativeILPMachineScheduler);
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.

View File

@ -95,6 +95,7 @@ add_llvm_target(AMDGPUCodeGen
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
GCNILPSched.cpp
)
add_subdirectory(AsmParser)

View File

@ -0,0 +1,364 @@
//===---------------------------- GCNILPSched.cpp - -----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
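/// \file
/// \brief Experimental ILP list scheduler for GCN: a partial port of the
/// SelectionDAG ILP scheduling heuristics to ScheduleDAG SUnits.
//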
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
namespace {
/// List scheduler that builds a schedule from the bottom roots of a
/// ScheduleDAG upwards, choosing between ready nodes with the heuristics in
/// pickBest().
class GCNILPScheduler {
  /// Intrusive list node that refers to one SUnit waiting in a queue.
  struct Candidate : ilist_node<Candidate> {
    SUnit *SU;

    Candidate(SUnit *SU_)
      : SU(SU_) {}
  };

  /// Backing storage for every Candidate created by this scheduler.
  SpecificBumpPtrAllocator<Candidate> Alloc;
  typedef simple_ilist<Candidate> Queue;
  /// Nodes whose successors have all been scheduled; releasePending() moves
  /// them to AvailQueue once their height does not exceed CurCycle.
  Queue PendingQueue;
  /// Nodes pickCandidate() chooses from.
  Queue AvailQueue;
  /// Next value stored into SUnit::NodeQueueId by releasePending().
  unsigned CurQueueId = 0;

  /// Sethi-Ullman numbers, indexed by SUnit::NodeNum; 0 means "not computed".
  std::vector<unsigned> SUNumbers;

  /// CurCycle - The current scheduler state corresponds to this cycle.
  unsigned CurCycle = 0;

  /// Returns the value pickBest() compares as the Sethi-Ullman priority of SU.
  unsigned getNodePriority(const SUnit *SU) const;

  /// Returns whichever of the two nodes should be picked first.
  const SUnit *pickBest(const SUnit *left, const SUnit *right);
  /// Returns the preferred entry of AvailQueue, or nullptr if it is empty.
  Candidate* pickCandidate();

  /// Moves pending nodes that fit into CurCycle over to AvailQueue.
  void releasePending();
  /// Raises CurCycle to NextCycle (never lowers it) and releases pending nodes.
  void advanceToCycle(unsigned NextCycle);
  /// Updates the predecessors of a just-scheduled node and queues those that
  /// have no unscheduled successors left.
  void releasePredecessors(const SUnit* SU);

public:
  /// Computes a schedule for all SUnits of \p DAG.
  ///
  /// \param BotRoots nodes that are immediately available for bottom-up
  ///        scheduling; they seed the available queue. (The parameter was
  ///        previously declared as TopRoots, which contradicted both the
  ///        definition and the caller.)
  /// \param DAG the scheduling DAG; its SUnits are modified while scheduling
  ///        and restored before returning.
  /// \returns the scheduled nodes in top-down order.
  std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> BotRoots,
                                     const ScheduleDAG &DAG);
};
} // namespace
/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
/// Smaller number is the higher priority.
static unsigned
CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
if (SethiUllmanNumber != 0)
return SethiUllmanNumber;
unsigned Extra = 0;
for (const SDep &Pred : SU->Preds) {
if (Pred.isCtrl()) continue; // ignore chain preds
SUnit *PredSU = Pred.getSUnit();
unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
if (PredSethiUllman > SethiUllmanNumber) {
SethiUllmanNumber = PredSethiUllman;
Extra = 0;
}
else if (PredSethiUllman == SethiUllmanNumber)
++Extra;
}
SethiUllmanNumber += Extra;
if (SethiUllmanNumber == 0)
SethiUllmanNumber = 1;
return SethiUllmanNumber;
}
// Returns the priority value pickBest() compares for SU; of two candidates the
// one with the smaller value is picked first by that comparison.
unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const {
  assert(SU->NodeNum < SUNumbers.size());

  const bool HasPreds = SU->NumPreds != 0;
  const bool HasSuccs = SU->NumSuccs != 0;

  // A node with inputs but no consumers (e.g. a store) ends a chain of
  // computation: give it a large value so it loses the priority comparison.
  if (HasPreds && !HasSuccs)
    return 0xffff;

  // A node with consumers but no inputs gets the smallest value, so it is
  // picked early in the bottom-up order and ends up close to its uses.
  if (HasSuccs && !HasPreds)
    return 0;

  // Everything else is ranked by its Sethi-Ullman number.
  return SUNumbers[SU->NodeNum];
}
/// closestSucc - Returns the largest height found among the data successors of
/// \p SU, or 0 when it has none. Control (chain) successors are ignored.
static unsigned closestSucc(const SUnit *SU) {
  unsigned Tallest = 0;
  for (const SDep &Edge : SU->Succs) {
    if (!Edge.isCtrl()) {
      const unsigned SuccHeight = Edge.getSUnit()->getHeight();
      Tallest = SuccHeight > Tallest ? SuccHeight : Tallest;
    }
  }
  return Tallest;
}
/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
/// for scratch registers: the number of data (non-control) predecessor edges
/// of \p SU.
static unsigned calcMaxScratches(const SUnit *SU) {
  unsigned DataPreds = 0;
  for (const SDep &Edge : SU->Preds)
    DataPreds += Edge.isCtrl() ? 0u : 1u;
  return DataPreds;
}
// Three-way latency comparison used as a tie-breaker by pickBest().
// Returns -1 if left has higher priority, 1 if right has higher priority and
// 0 if both are equivalent. The criteria are applied in this order:
//   1. smaller height wins,
//   2. larger depth wins,
//   3. smaller latency wins.
static int BUCompareLatency(const SUnit *left, const SUnit *right) {
  const int LeftHeight = (int)left->getHeight();
  const int RightHeight = (int)right->getHeight();
  if (LeftHeight > RightHeight)
    return 1;
  if (LeftHeight < RightHeight)
    return -1;

  // Heights are equal, fall back to depth.
  const int LeftDepth = left->getDepth();
  const int RightDepth = right->getDepth();
  if (LeftDepth != RightDepth) {
    DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
                 << ") depth " << LeftDepth << " vs SU (" << right->NodeNum
                 << ") depth " << RightDepth << "\n");
    return LeftDepth < RightDepth ? 1 : -1;
  }

  // Same height and depth: only the node latency is left to compare.
  if (left->Latency == right->Latency)
    return 0;
  return left->Latency > right->Latency ? 1 : -1;
}
// Chooses which of two ready nodes should be picked first. The heuristics are
// tried in order and the first one that separates the nodes decides:
//   1. depth difference larger than the reorder window (deeper node wins),
//   2. height difference larger than the reorder window (lower node wins),
//   3. Sethi-Ullman priority (smaller value wins),
//   4. closestSucc() (larger value wins),
//   5. calcMaxScratches() (fewer data predecessors wins),
//   6. BUCompareLatency(); on a complete tie left is returned.
const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
{
  // TODO: add register pressure lowering checks

  // Depth/height differences up to this size are left to the finer-grained
  // heuristics further down.
  const int MaxReorderWindow = 6;

  bool const DisableSchedCriticalPath = false;
  if (!DisableSchedCriticalPath) {
    const int DepthSpread = (int)left->getDepth() - (int)right->getDepth();
    if (std::abs(DepthSpread) > MaxReorderWindow) {
      DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
                   << left->getDepth() << " != SU(" << right->NodeNum << "): "
                   << right->getDepth() << "\n");
      return left->getDepth() < right->getDepth() ? right : left;
    }
  }

  bool const DisableSchedHeight = false;
  if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
    const int HeightSpread = (int)left->getHeight() - (int)right->getHeight();
    if (std::abs(HeightSpread) > MaxReorderWindow)
      return left->getHeight() > right->getHeight() ? right : left;
  }

  // Prioritize by Sethi-Ullman number.
  const unsigned LeftPriority = getNodePriority(left);
  const unsigned RightPriority = getNodePriority(right);
  if (LeftPriority != RightPriority)
    return LeftPriority > RightPriority ? right : left;

  // With equal Sethi-Ullman numbers try to keep a def next to its use.
  // e.g.
  //   t1 = op t2, c1
  //   t3 = op t4, c2
  //
  // and the following instructions are both ready.
  //   t2 = op c3
  //   t4 = op c4
  //
  // Then schedule t2 = op first.
  // i.e.
  //   t4 = op c4
  //   t2 = op c3
  //   t1 = op t2, c1
  //   t3 = op t4, c2
  //
  // This creates more short live intervals.
  const unsigned LeftDist = closestSucc(left);
  const unsigned RightDist = closestSucc(right);
  if (LeftDist != RightDist)
    return LeftDist < RightDist ? right : left;

  // Number of registers that become live when the node is scheduled.
  const unsigned LeftScratch = calcMaxScratches(left);
  const unsigned RightScratch = calcMaxScratches(right);
  if (LeftScratch != RightScratch)
    return LeftScratch > RightScratch ? right : left;

  bool const DisableSchedCycles = false;
  if (!DisableSchedCycles) {
    // A positive result favours right; a negative result or a tie keeps left.
    return BUCompareLatency(left, right) > 0 ? right : left;
  }

  // Only reached when DisableSchedCycles is set: plain height/depth ordering
  // followed by queue insertion order.
  if (left->getHeight() != right->getHeight())
    return (left->getHeight() > right->getHeight()) ? right : left;

  if (left->getDepth() != right->getDepth())
    return (left->getDepth() < right->getDepth()) ? right : left;

  assert(left->NodeQueueId && right->NodeQueueId &&
         "NodeQueueId cannot be zero");
  return (left->NodeQueueId > right->NodeQueueId) ? right : left;
}
// Returns the entry of AvailQueue preferred by pickBest(), or nullptr when the
// queue is empty. The entry stays in the queue; the caller removes it.
GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() {
  if (AvailQueue.empty())
    return nullptr;

  // Start with the first entry and let every later entry challenge it.
  Candidate *Winner = &*AvailQueue.begin();
  for (auto It = std::next(AvailQueue.begin()), End = AvailQueue.end();
       It != End; ++It) {
    const SUnit *Picked = pickBest(Winner->SU, It->SU);
    if (Picked == Winner->SU)
      continue;
    assert(Picked == It->SU);
    Winner = &*It;
  }
  return Winner;
}
void GCNILPScheduler::releasePending() {
// Check to see if any of the pending instructions are ready to issue. If
// so, add them to the available queue.
for(auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) {
auto &C = *I++;
if (C.SU->getHeight() <= CurCycle) {
PendingQueue.remove(C);
AvailQueue.push_back(C);
C.SU->NodeQueueId = CurQueueId++;
}
}
}
/// Advance the scheduler to \p NextCycle and release the pending nodes that
/// became ready. Does nothing unless \p NextCycle is later than CurCycle.
void GCNILPScheduler::advanceToCycle(unsigned NextCycle) {
  if (NextCycle > CurCycle) {
    CurCycle = NextCycle;
    releasePending();
  }
}
void GCNILPScheduler::releasePredecessors(const SUnit* SU) {
for (const auto &PredEdge : SU->Preds) {
auto PredSU = PredEdge.getSUnit();
if (PredEdge.isWeak())
continue;
assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0);
PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency());
if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0)
PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU));
}
}
// Produces a schedule for all SUnits of DAG.
//
// The nodes are picked bottom-up starting from BotRoots and the predecessors
// of DAG.ExitSU; the collected order is reversed before it is returned. The
// SUnits are modified while scheduling, so they are copied up front and written
// back before returning.
std::vector<const SUnit*>
GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
                          const ScheduleDAG &DAG) {
  auto &Units = const_cast<ScheduleDAG&>(DAG).SUnits;

  // We cannot save only the fields we touch because some of them are private,
  // so the units are saved verbatim. This assumes SUnit has value semantics.
  std::vector<SUnit> Backup(Units.size());
  for (const SUnit &Unit : Units)
    Backup[Unit.NodeNum] = Unit;

  SUNumbers.assign(Units.size(), 0);
  for (const SUnit &Unit : Units)
    CalcNodeSethiUllmanNumber(&Unit, SUNumbers);

  // Bottom roots are available right away.
  for (const SUnit *Root : BotRoots) {
    Candidate *Cand =
        new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(Root));
    AvailQueue.push_back(*Cand);
  }
  releasePredecessors(&DAG.ExitSU);

  std::vector<const SUnit*> Order;
  Order.reserve(Units.size());

  for (;;) {
    // Nothing available but something pending: move on by at least one cycle,
    // or up to the smallest pending height if that is further away.
    if (AvailQueue.empty() && !PendingQueue.empty()) {
      const auto Lowest = std::min_element(
        PendingQueue.begin(), PendingQueue.end(),
        [](const Candidate &A, const Candidate &B) {
          return A.SU->getHeight() < B.SU->getHeight();
        });
      advanceToCycle(std::max(CurCycle + 1, Lowest->SU->getHeight()));
    }
    if (AvailQueue.empty())
      break;

    DEBUG(
      dbgs() << "\n=== Picking candidate\n"
                "Ready queue:";
      for (auto &C : AvailQueue)
        dbgs() << ' ' << C.SU->NodeNum;
      dbgs() << '\n';
    );

    Candidate *Picked = pickCandidate();
    assert(Picked);
    AvailQueue.remove(*Picked);
    SUnit *Node = Picked->SU;
    DEBUG(dbgs() << "Selected "; Node->dump(&DAG));

    advanceToCycle(Node->getHeight());

    releasePredecessors(Node);
    Order.push_back(Node);
    Node->isScheduled = true;
  }
  assert(Units.size() == Order.size());

  // Nodes were collected bottom-up; callers expect top-down order.
  std::reverse(Order.begin(), Order.end());

  // restore units
  for (SUnit &Unit : Units)
    Unit = Backup[Unit.NodeNum];

  return Order;
}
namespace llvm {
/// Runs a fresh GCNILPScheduler over \p DAG, seeded with \p BotRoots, and
/// returns the resulting schedule.
std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
                                              const ScheduleDAG &DAG) {
  return GCNILPScheduler().schedule(BotRoots, DAG);
}
}

View File

@ -39,7 +39,9 @@ namespace llvm {
std::vector<const SUnit *> makeMinRegSchedule(ArrayRef<const SUnit *> TopRoots,
const ScheduleDAG &DAG);
} // end namespace llvm
std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
const ScheduleDAG &DAG);
}
// shim accessors for different order containers
static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
@ -141,6 +143,7 @@ class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;
SmallVector<SUnit *, 8> TopRoots;
SmallVector<SUnit*, 8> BotRoots;
public:
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
: Sch(_Sch) {
@ -151,8 +154,6 @@ public:
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/*TrackLaneMask*/true);
Sch.Topo.InitDAGTopologicalSorting();
SmallVector<SUnit *, 8> BotRoots;
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}
@ -164,6 +165,9 @@ public:
/// Returns a read-only view of the TopRoots member.
ArrayRef<const SUnit *> getTopRoots() const {
return TopRoots;
}
/// Returns a view of the BotRoots member (the pointed-to SUnits are not
/// const-qualified, unlike in getTopRoots()).
ArrayRef<SUnit*> getBottomRoots() const {
return BotRoots;
}
};
class GCNIterativeScheduler::OverrideLegacyStrategy {
@ -323,6 +327,7 @@ void GCNIterativeScheduler::finalizeSchedule() { // overriden
case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
case SCHEDULE_ILP: scheduleILP(false); break;
}
}
@ -553,3 +558,43 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
MaxPressure = RP;
}
}
///////////////////////////////////////////////////////////////////////////////
// ILP scheduler port
/// Schedules every region with the ILP scheduler.
///
/// The target occupancy starts as the smaller of the local-memory occupancy
/// and the upper waves-per-EU bound, and is then limited by the occupancy of
/// the first region after sortRegionsByPressure(). If \p TryMaximizeOccupancy
/// is set and that region is below the target, tryMaximizeOccupancy() is
/// given a chance to raise it first.
///
/// For each region the ILP schedule is applied only if its register pressure
/// still reaches the target occupancy. Otherwise the region's BestSchedule is
/// applied when it reaches the target, and the region is left untouched when
/// it does not.
void GCNIterativeScheduler::scheduleILP(
  bool TryMaximizeOccupancy) {
  const auto &ST = MF.getSubtarget<SISubtarget>();
  auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF),
                         ST.getWavesPerEU(*MF.getFunction()).second);

  sortRegionsByPressure(TgtOcc);
  auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);

  if (TryMaximizeOccupancy && Occ < TgtOcc)
    Occ = tryMaximizeOccupancy(TgtOcc);

  TgtOcc = std::min(Occ, TgtOcc);
  // Name the scheduler that is actually running; the message used to say
  // "default scheduler".
  DEBUG(dbgs() << "Scheduling using ILP scheduler, "
                  "target occupancy = " << TgtOcc << '\n');

  for (auto R : Regions) {
    BuildDAG DAG(*R, *this);
    const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);

    const auto RP = getSchedulePressure(*R, ILPSchedule);
    DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));

    if (RP.getOccupancy(ST) < TgtOcc) {
      DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
      if (R->BestSchedule.get() &&
          R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
        DEBUG(dbgs() << ", scheduling minimal register\n");
        scheduleBest(*R);
      } else {
        // Nothing is applied to this region; terminate the debug line that
        // was started above so the next region's output begins on a new line.
        DEBUG(dbgs() << ", keeping current order\n");
      }
    } else {
      scheduleRegion(*R, ILPSchedule, RP);
      DEBUG(printSchedResult(dbgs(), R, RP));
    }
  }
}

View File

@ -32,7 +32,8 @@ public:
enum StrategyKind {
SCHEDULE_MINREGONLY,
SCHEDULE_MINREGFORCED,
SCHEDULE_LEGACYMAXOCCUPANCY
SCHEDULE_LEGACYMAXOCCUPANCY,
SCHEDULE_ILP
};
GCNIterativeScheduler(MachineSchedContext *C,
@ -108,6 +109,7 @@ protected:
void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
void scheduleMinReg(bool force = false);
void scheduleILP(bool TryMaximizeOccupancy = true);
void printRegions(raw_ostream &OS) const;
void printSchedResult(raw_ostream &OS,

View File

@ -0,0 +1,589 @@
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s
; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) #0 {
bb:
%tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
%tmp2 = load float, float addrspace(3)* %tmp, align 4
%tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
%tmp4 = load float, float addrspace(3)* %tmp3, align 4
%tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
%tmp6 = load float, float addrspace(3)* %tmp5, align 4
%tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
%tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
%tmp9 = load float, float addrspace(3)* %tmp8, align 4
%tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
%tmp11 = load float, float addrspace(3)* %tmp10, align 4
%tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
%tmp13 = load float, float addrspace(3)* %tmp12, align 4
%tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
%tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
%tmp16 = load float, float addrspace(3)* %tmp15, align 4
%tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
%tmp18 = load float, float addrspace(3)* %tmp17, align 4
%tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
%tmp20 = load float, float addrspace(3)* %tmp19, align 4
%tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
%tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
%tmp23 = load float, float addrspace(3)* %tmp22, align 4
%tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
%tmp25 = load float, float addrspace(3)* %tmp24, align 4
%tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
%tmp27 = load float, float addrspace(3)* %tmp26, align 4
%tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
%tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
%tmp30 = load float, float addrspace(3)* %tmp29, align 4
%tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
%tmp32 = load float, float addrspace(3)* %tmp31, align 4
%tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
%tmp34 = load float, float addrspace(3)* %tmp33, align 4
%tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
%tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
%tmp37 = load float, float addrspace(3)* %tmp36, align 4
%tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
%tmp39 = load float, float addrspace(3)* %tmp38, align 4
%tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
%tmp41 = load float, float addrspace(3)* %tmp40, align 4
%tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
%tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
%tmp44 = load float, float addrspace(3)* %tmp43, align 4
%tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
%tmp46 = load float, float addrspace(3)* %tmp45, align 4
%tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
%tmp48 = load float, float addrspace(3)* %tmp47, align 4
%tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
%tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
%tmp51 = load float, float addrspace(3)* %tmp50, align 4
%tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
%tmp53 = load float, float addrspace(3)* %tmp52, align 4
%tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
%tmp55 = load float, float addrspace(3)* %tmp54, align 4
%tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
%tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
%tmp58 = load float, float addrspace(3)* %tmp57, align 4
%tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
%tmp60 = load float, float addrspace(3)* %tmp59, align 4
%tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
%tmp62 = load float, float addrspace(3)* %tmp61, align 4
%tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
%tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
%tmp65 = load float, float addrspace(3)* %tmp64, align 4
%tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
%tmp67 = load float, float addrspace(3)* %tmp66, align 4
%tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
%tmp69 = load float, float addrspace(3)* %tmp68, align 4
%tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
%tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
%tmp72 = load float, float addrspace(3)* %tmp71, align 4
%tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
%tmp74 = load float, float addrspace(3)* %tmp73, align 4
%tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
%tmp76 = load float, float addrspace(3)* %tmp75, align 4
%tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
%tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
%tmp79 = load float, float addrspace(3)* %tmp78, align 4
%tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
%tmp81 = load float, float addrspace(3)* %tmp80, align 4
%tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
%tmp83 = load float, float addrspace(3)* %tmp82, align 4
%tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
%tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
%tmp86 = load float, float addrspace(3)* %tmp85, align 4
%tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
%tmp88 = load float, float addrspace(3)* %tmp87, align 4
%tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
%tmp90 = load float, float addrspace(3)* %tmp89, align 4
%tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
%tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
%tmp93 = load float, float addrspace(3)* %tmp92, align 4
%tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
%tmp95 = load float, float addrspace(3)* %tmp94, align 4
%tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
%tmp97 = load float, float addrspace(3)* %tmp96, align 4
%tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
%tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
%tmp100 = load float, float addrspace(3)* %tmp99, align 4
%tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
%tmp102 = load float, float addrspace(3)* %tmp101, align 4
%tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
%tmp104 = load float, float addrspace(3)* %tmp103, align 4
%tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
%tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
%tmp107 = load float, float addrspace(3)* %tmp106, align 4
%tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
%tmp109 = load float, float addrspace(3)* %tmp108, align 4
%tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
%tmp111 = load float, float addrspace(3)* %tmp110, align 4
%tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
%tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
%tmp114 = load float, float addrspace(3)* %tmp113, align 4
%tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
%tmp116 = load float, float addrspace(3)* %tmp115, align 4
%tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
%tmp118 = load float, float addrspace(3)* %tmp117, align 4
%tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
%tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
%tmp121 = load float, float addrspace(3)* %tmp120, align 4
%tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
%tmp123 = load float, float addrspace(3)* %tmp122, align 4
%tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
%tmp125 = load float, float addrspace(3)* %tmp124, align 4
%tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
%tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
%tmp128 = load float, float addrspace(3)* %tmp127, align 4
%tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
%tmp130 = load float, float addrspace(3)* %tmp129, align 4
%tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
%tmp132 = load float, float addrspace(3)* %tmp131, align 4
%tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
%tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
%tmp135 = load float, float addrspace(3)* %tmp134, align 4
%tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
%tmp137 = load float, float addrspace(3)* %tmp136, align 4
%tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
%tmp139 = load float, float addrspace(3)* %tmp138, align 4
%tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
%tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
%tmp142 = load float, float addrspace(3)* %tmp141, align 4
%tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
%tmp144 = load float, float addrspace(3)* %tmp143, align 4
%tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
%tmp146 = load float, float addrspace(3)* %tmp145, align 4
%tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
%tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
%tmp149 = load float, float addrspace(3)* %tmp148, align 4
%tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
%tmp151 = load float, float addrspace(3)* %tmp150, align 4
%tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
%tmp153 = load float, float addrspace(3)* %tmp152, align 4
%tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
%tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
%tmp156 = load float, float addrspace(3)* %tmp155, align 4
%tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
%tmp158 = load float, float addrspace(3)* %tmp157, align 4
%tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
%tmp160 = load float, float addrspace(3)* %tmp159, align 4
%tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
%tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
%tmp163 = load float, float addrspace(3)* %tmp162, align 4
%tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
%tmp165 = load float, float addrspace(3)* %tmp164, align 4
%tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
%tmp167 = load float, float addrspace(3)* %tmp166, align 4
%tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
%tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
%tmp170 = load float, float addrspace(3)* %tmp169, align 4
%tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
%tmp172 = load float, float addrspace(3)* %tmp171, align 4
%tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
%tmp174 = load float, float addrspace(3)* %tmp173, align 4
%tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
%tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
%tmp177 = load float, float addrspace(3)* %tmp176, align 4
%tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
%tmp179 = load float, float addrspace(3)* %tmp178, align 4
%tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
%tmp181 = load float, float addrspace(3)* %tmp180, align 4
%tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
%tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
%tmp184 = load float, float addrspace(3)* %tmp183, align 4
%tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
%tmp186 = load float, float addrspace(3)* %tmp185, align 4
%tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
%tmp188 = load float, float addrspace(3)* %tmp187, align 4
%tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
%tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
%tmp191 = load float, float addrspace(3)* %tmp190, align 4
%tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
%tmp193 = load float, float addrspace(3)* %tmp192, align 4
%tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
%tmp195 = load float, float addrspace(3)* %tmp194, align 4
%tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
%tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
%tmp198 = load float, float addrspace(3)* %tmp197, align 4
%tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
%tmp200 = load float, float addrspace(3)* %tmp199, align 4
%tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
%tmp202 = load float, float addrspace(3)* %tmp201, align 4
%tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
%tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
%tmp205 = load float, float addrspace(3)* %tmp204, align 4
%tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
%tmp207 = load float, float addrspace(3)* %tmp206, align 4
%tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
%tmp209 = load float, float addrspace(3)* %tmp208, align 4
%tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
%tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
%tmp212 = load float, float addrspace(3)* %tmp211, align 4
%tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
%tmp214 = load float, float addrspace(3)* %tmp213, align 4
%tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
%tmp216 = load float, float addrspace(3)* %tmp215, align 4
%tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
%tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
%tmp219 = load float, float addrspace(3)* %tmp218, align 4
%tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
%tmp221 = load float, float addrspace(3)* %tmp220, align 4
%tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
%tmp223 = load float, float addrspace(3)* %tmp222, align 4
%tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
%tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
%tmp226 = load float, float addrspace(3)* %tmp225, align 4
%tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
%tmp228 = load float, float addrspace(3)* %tmp227, align 4
%tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
%tmp230 = load float, float addrspace(3)* %tmp229, align 4
%tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
%tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
%tmp233 = load float, float addrspace(3)* %tmp232, align 4
%tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
%tmp235 = load float, float addrspace(3)* %tmp234, align 4
%tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
%tmp237 = load float, float addrspace(3)* %tmp236, align 4
%tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
%tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
%tmp240 = load float, float addrspace(3)* %tmp239, align 4
%tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
%tmp242 = load float, float addrspace(3)* %tmp241, align 4
%tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
%tmp244 = load float, float addrspace(3)* %tmp243, align 4
%tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
%tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
%tmp247 = load float, float addrspace(3)* %tmp246, align 4
%tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
%tmp249 = load float, float addrspace(3)* %tmp248, align 4
%tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
%tmp251 = load float, float addrspace(3)* %tmp250, align 4
%tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
%tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
%tmp254 = load float, float addrspace(3)* %tmp253, align 4
%tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
%tmp256 = load float, float addrspace(3)* %tmp255, align 4
%tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
%tmp258 = load float, float addrspace(3)* %tmp257, align 4
%tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
%tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
%tmp261 = load float, float addrspace(3)* %tmp260, align 4
%tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
%tmp263 = load float, float addrspace(3)* %tmp262, align 4
%tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
%tmp265 = load float, float addrspace(3)* %tmp264, align 4
%tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
%tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
%tmp268 = load float, float addrspace(3)* %tmp267, align 4
%tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
%tmp270 = load float, float addrspace(3)* %tmp269, align 4
%tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
%tmp272 = load float, float addrspace(3)* %tmp271, align 4
%tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
%tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
%tmp275 = load float, float addrspace(3)* %tmp274, align 4
%tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
%tmp277 = load float, float addrspace(3)* %tmp276, align 4
%tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
%tmp279 = load float, float addrspace(3)* %tmp278, align 4
%tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
%tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
%tmp282 = load float, float addrspace(3)* %tmp281, align 4
%tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
%tmp284 = load float, float addrspace(3)* %tmp283, align 4
%tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
%tmp286 = load float, float addrspace(3)* %tmp285, align 4
%tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
%tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
%tmp289 = load float, float addrspace(3)* %tmp288, align 4
%tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
%tmp291 = load float, float addrspace(3)* %tmp290, align 4
%tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
%tmp293 = load float, float addrspace(3)* %tmp292, align 4
%tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
%tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
%tmp296 = load float, float addrspace(3)* %tmp295, align 4
%tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
%tmp298 = load float, float addrspace(3)* %tmp297, align 4
%tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
%tmp300 = load float, float addrspace(3)* %tmp299, align 4
%tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
%tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
%tmp303 = load float, float addrspace(3)* %tmp302, align 4
%tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
%tmp305 = load float, float addrspace(3)* %tmp304, align 4
%tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
%tmp307 = load float, float addrspace(3)* %tmp306, align 4
%tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
%tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
%tmp310 = load float, float addrspace(3)* %tmp309, align 4
%tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
%tmp312 = load float, float addrspace(3)* %tmp311, align 4
%tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
%tmp314 = load float, float addrspace(3)* %tmp313, align 4
%tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
%tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
%tmp317 = load float, float addrspace(3)* %tmp316, align 4
%tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
%tmp319 = load float, float addrspace(3)* %tmp318, align 4
%tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
%tmp321 = load float, float addrspace(3)* %tmp320, align 4
%tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
%tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
%tmp324 = load float, float addrspace(3)* %tmp323, align 4
%tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
%tmp326 = load float, float addrspace(3)* %tmp325, align 4
%tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
%tmp328 = load float, float addrspace(3)* %tmp327, align 4
%tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
%tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
%tmp331 = load float, float addrspace(3)* %tmp330, align 4
%tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
%tmp333 = load float, float addrspace(3)* %tmp332, align 4
%tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
%tmp335 = load float, float addrspace(3)* %tmp334, align 4
%tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
%tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
%tmp338 = load float, float addrspace(3)* %tmp337, align 4
%tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
%tmp340 = load float, float addrspace(3)* %tmp339, align 4
%tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
%tmp342 = load float, float addrspace(3)* %tmp341, align 4
%tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
%tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
%tmp345 = load float, float addrspace(3)* %tmp344, align 4
%tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
%tmp347 = load float, float addrspace(3)* %tmp346, align 4
%tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
%tmp349 = load float, float addrspace(3)* %tmp348, align 4
%tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
%tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
%tmp352 = load float, float addrspace(3)* %tmp351, align 4
%tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
%tmp354 = load float, float addrspace(3)* %tmp353, align 4
%tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
%tmp356 = load float, float addrspace(3)* %tmp355, align 4
%tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
%tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
%tmp359 = load float, float addrspace(3)* %tmp358, align 4
%tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
%tmp361 = load float, float addrspace(3)* %tmp360, align 4
%tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
%tmp363 = load float, float addrspace(3)* %tmp362, align 4
%tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
%tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
%tmp366 = load float, float addrspace(3)* %tmp365, align 4
%tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
%tmp368 = load float, float addrspace(3)* %tmp367, align 4
%tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
%tmp370 = load float, float addrspace(3)* %tmp369, align 4
%tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
%tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
%tmp373 = load float, float addrspace(3)* %tmp372, align 4
%tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
%tmp375 = load float, float addrspace(3)* %tmp374, align 4
%tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
%tmp377 = load float, float addrspace(3)* %tmp376, align 4
%tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
%tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
%tmp380 = load float, float addrspace(3)* %tmp379, align 4
%tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
%tmp382 = load float, float addrspace(3)* %tmp381, align 4
%tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
%tmp384 = load float, float addrspace(3)* %tmp383, align 4
%tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
%tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
%tmp387 = load float, float addrspace(3)* %tmp386, align 4
%tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
%tmp389 = load float, float addrspace(3)* %tmp388, align 4
%tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
%tmp391 = load float, float addrspace(3)* %tmp390, align 4
%tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
%tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
%tmp394 = load float, float addrspace(3)* %tmp393, align 4
%tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
%tmp396 = load float, float addrspace(3)* %tmp395, align 4
%tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
%tmp398 = load float, float addrspace(3)* %tmp397, align 4
%tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
%tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
%tmp401 = load float, float addrspace(3)* %tmp400, align 4
%tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
%tmp403 = load float, float addrspace(3)* %tmp402, align 4
%tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
%tmp405 = load float, float addrspace(3)* %tmp404, align 4
%tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
%tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
%tmp408 = load float, float addrspace(3)* %tmp407, align 4
%tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
%tmp410 = load float, float addrspace(3)* %tmp409, align 4
%tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
%tmp412 = load float, float addrspace(3)* %tmp411, align 4
%tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
%tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
%tmp415 = load float, float addrspace(3)* %tmp414, align 4
%tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
%tmp417 = load float, float addrspace(3)* %tmp416, align 4
%tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
%tmp419 = load float, float addrspace(3)* %tmp418, align 4
%tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
%tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
%tmp422 = load float, float addrspace(3)* %tmp421, align 4
%tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
%tmp424 = load float, float addrspace(3)* %tmp423, align 4
%tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
%tmp426 = load float, float addrspace(3)* %tmp425, align 4
%tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
%tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
%tmp429 = load float, float addrspace(3)* %tmp428, align 4
%tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
%tmp431 = load float, float addrspace(3)* %tmp430, align 4
%tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
%tmp433 = load float, float addrspace(3)* %tmp432, align 4
%tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
%tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
%tmp436 = load float, float addrspace(3)* %tmp435, align 4
%tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
%tmp438 = load float, float addrspace(3)* %tmp437, align 4
%tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
%tmp440 = load float, float addrspace(3)* %tmp439, align 4
%tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
%tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
%tmp443 = load float, float addrspace(3)* %tmp442, align 4
%tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
%tmp445 = load float, float addrspace(3)* %tmp444, align 4
%tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
%tmp447 = load float, float addrspace(3)* %tmp446, align 4
%tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
store float %tmp7, float addrspace(1)* %arg1, align 4
%tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
store float %tmp14, float addrspace(1)* %tmp449, align 4
%tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
store float %tmp21, float addrspace(1)* %tmp450, align 4
%tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
store float %tmp28, float addrspace(1)* %tmp451, align 4
%tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
store float %tmp35, float addrspace(1)* %tmp452, align 4
%tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
store float %tmp42, float addrspace(1)* %tmp453, align 4
%tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
store float %tmp49, float addrspace(1)* %tmp454, align 4
%tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
store float %tmp56, float addrspace(1)* %tmp455, align 4
%tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
store float %tmp63, float addrspace(1)* %tmp456, align 4
%tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
store float %tmp70, float addrspace(1)* %tmp457, align 4
%tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
store float %tmp77, float addrspace(1)* %tmp458, align 4
%tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
store float %tmp84, float addrspace(1)* %tmp459, align 4
%tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
store float %tmp91, float addrspace(1)* %tmp460, align 4
%tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
store float %tmp98, float addrspace(1)* %tmp461, align 4
%tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
store float %tmp105, float addrspace(1)* %tmp462, align 4
%tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
store float %tmp112, float addrspace(1)* %tmp463, align 4
%tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
store float %tmp119, float addrspace(1)* %tmp464, align 4
%tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
store float %tmp126, float addrspace(1)* %tmp465, align 4
%tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
store float %tmp133, float addrspace(1)* %tmp466, align 4
%tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
store float %tmp140, float addrspace(1)* %tmp467, align 4
%tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
store float %tmp147, float addrspace(1)* %tmp468, align 4
%tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
store float %tmp154, float addrspace(1)* %tmp469, align 4
%tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
store float %tmp161, float addrspace(1)* %tmp470, align 4
%tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
store float %tmp168, float addrspace(1)* %tmp471, align 4
%tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
store float %tmp175, float addrspace(1)* %tmp472, align 4
%tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
store float %tmp182, float addrspace(1)* %tmp473, align 4
%tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
store float %tmp189, float addrspace(1)* %tmp474, align 4
%tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
store float %tmp196, float addrspace(1)* %tmp475, align 4
%tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
store float %tmp203, float addrspace(1)* %tmp476, align 4
%tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
store float %tmp210, float addrspace(1)* %tmp477, align 4
%tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
store float %tmp217, float addrspace(1)* %tmp478, align 4
%tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
store float %tmp224, float addrspace(1)* %tmp479, align 4
%tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
store float %tmp231, float addrspace(1)* %tmp480, align 4
%tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
store float %tmp238, float addrspace(1)* %tmp481, align 4
%tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
store float %tmp245, float addrspace(1)* %tmp482, align 4
%tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
store float %tmp252, float addrspace(1)* %tmp483, align 4
%tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
store float %tmp259, float addrspace(1)* %tmp484, align 4
%tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
store float %tmp266, float addrspace(1)* %tmp485, align 4
%tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
store float %tmp273, float addrspace(1)* %tmp486, align 4
%tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
store float %tmp280, float addrspace(1)* %tmp487, align 4
%tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
store float %tmp287, float addrspace(1)* %tmp488, align 4
%tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
store float %tmp294, float addrspace(1)* %tmp489, align 4
%tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
store float %tmp301, float addrspace(1)* %tmp490, align 4
%tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
store float %tmp308, float addrspace(1)* %tmp491, align 4
%tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
store float %tmp315, float addrspace(1)* %tmp492, align 4
%tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
store float %tmp322, float addrspace(1)* %tmp493, align 4
%tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
store float %tmp329, float addrspace(1)* %tmp494, align 4
%tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
store float %tmp336, float addrspace(1)* %tmp495, align 4
%tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
store float %tmp343, float addrspace(1)* %tmp496, align 4
%tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
store float %tmp350, float addrspace(1)* %tmp497, align 4
%tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
store float %tmp357, float addrspace(1)* %tmp498, align 4
%tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
store float %tmp364, float addrspace(1)* %tmp499, align 4
%tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
store float %tmp371, float addrspace(1)* %tmp500, align 4
%tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
store float %tmp378, float addrspace(1)* %tmp501, align 4
%tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
store float %tmp385, float addrspace(1)* %tmp502, align 4
%tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
store float %tmp392, float addrspace(1)* %tmp503, align 4
%tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
store float %tmp399, float addrspace(1)* %tmp504, align 4
%tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
store float %tmp406, float addrspace(1)* %tmp505, align 4
%tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
store float %tmp413, float addrspace(1)* %tmp506, align 4
%tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
store float %tmp420, float addrspace(1)* %tmp507, align 4
%tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
store float %tmp427, float addrspace(1)* %tmp508, align 4
%tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
store float %tmp434, float addrspace(1)* %tmp509, align 4
%tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
store float %tmp441, float addrspace(1)* %tmp510, align 4
%tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
store float %tmp448, float addrspace(1)* %tmp511, align 4
ret void
}
; Declaration of the @llvm.fmuladd.f32 intrinsic (three float operands, float
; result) that the function body above invokes through `tail call`, once per
; group of three loaded floats. It carries attribute group #1.
; Function Attrs: nounwind readnone
declare float @llvm.fmuladd.f32(float, float, float) #1
; Attribute group #0 is not referenced in the lines visible here; presumably it
; is attached to the function definition whose header precedes this point
; (TODO confirm). NOTE(review): "amdgpu-waves-per-eu"="1,1" looks like a
; min,max waves-per-execution-unit request of exactly 1 -- verify against the
; AMDGPU backend documentation.
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
; Attribute group #1 is the one applied to the @llvm.fmuladd.f32 declaration.
attributes #1 = { nounwind readnone }