mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 20:23:11 +01:00
[ARM] bottom-top mul support in ARMParallelDSP
When no sequences can be converted into dual MACs, try to find sequential 16-bit loads that are used by muls; each such pair of loads can be replaced by a single wide load so that the muls can be selected as smultb, smulbt or smultt. Differential Revision: https://reviews.llvm.org/D51983 llvm-svn: 342210
This commit is contained in:
parent
e8e7140cf4
commit
3764cb8a6f
@ -55,6 +55,7 @@ namespace {
|
||||
using ReductionList = SmallVector<Reduction, 8>;
|
||||
using ValueList = SmallVector<Value*, 8>;
|
||||
using MemInstList = SmallVector<Instruction*, 8>;
|
||||
using LoadInstList = SmallVector<LoadInst*, 8>;
|
||||
using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
|
||||
using PMACPairList = SmallVector<PMACPair, 8>;
|
||||
using Instructions = SmallVector<Instruction*,16>;
|
||||
@ -63,7 +64,8 @@ namespace {
|
||||
struct OpChain {
|
||||
Instruction *Root;
|
||||
ValueList AllValues;
|
||||
MemInstList VecLd; // List of all load instructions.
|
||||
MemInstList VecLd; // List of all sequential load instructions.
|
||||
LoadInstList Loads; // List of all load instructions.
|
||||
MemLocList MemLocs; // All memory locations read by this tree.
|
||||
bool ReadOnly = true;
|
||||
|
||||
@ -76,8 +78,10 @@ namespace {
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
if (I->mayWriteToMemory())
|
||||
ReadOnly = false;
|
||||
if (auto *Ld = dyn_cast<LoadInst>(V))
|
||||
if (auto *Ld = dyn_cast<LoadInst>(V)) {
|
||||
MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
|
||||
Loads.push_back(Ld);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -135,6 +139,7 @@ namespace {
|
||||
/// exchange the halfwords of the second operand before performing the
|
||||
/// arithmetic.
|
||||
bool MatchSMLAD(Function &F);
|
||||
bool MatchTopBottomMuls(BasicBlock *LoopBody);
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
@ -203,6 +208,8 @@ namespace {
|
||||
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
|
||||
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
|
||||
Changes = MatchSMLAD(F);
|
||||
if (!Changes)
|
||||
Changes = MatchTopBottomMuls(Header);
|
||||
return Changes;
|
||||
}
|
||||
};
|
||||
@ -496,10 +503,10 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
|
||||
);
|
||||
}
|
||||
|
||||
static void AddMACCandidate(OpChainList &Candidates,
|
||||
static void AddMulCandidate(OpChainList &Candidates,
|
||||
Instruction *Mul,
|
||||
Value *MulOp0, Value *MulOp1) {
|
||||
LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
|
||||
LLVM_DEBUG(dbgs() << "OK, found mul:\t"; Mul->dump());
|
||||
assert(Mul->getOpcode() == Instruction::Mul &&
|
||||
"expected mul instruction");
|
||||
ValueList LHS;
|
||||
@ -533,14 +540,14 @@ static void MatchParallelMACSequences(Reduction &R,
|
||||
break;
|
||||
case Instruction::Mul:
|
||||
if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
|
||||
AddMACCandidate(Candidates, I, MulOp0, MulOp1);
|
||||
AddMulCandidate(Candidates, I, MulOp0, MulOp1);
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Instruction::SExt:
|
||||
if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) {
|
||||
Instruction *Mul = cast<Instruction>(I->getOperand(0));
|
||||
AddMACCandidate(Candidates, Mul, MulOp0, MulOp1);
|
||||
AddMulCandidate(Candidates, Mul, MulOp0, MulOp1);
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@ -569,23 +576,24 @@ static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
|
||||
// the memory locations accessed by the MAC-chains.
|
||||
// TODO: we need the read statements when we accept more complicated chains.
|
||||
static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
|
||||
Instructions &Writes, OpChainList &MACCandidates) {
|
||||
Instructions &Writes, OpChainList &Candidates) {
|
||||
LLVM_DEBUG(dbgs() << "Alias checks:\n");
|
||||
for (auto &MAC : MACCandidates) {
|
||||
LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
|
||||
for (auto &Candidate : Candidates) {
|
||||
LLVM_DEBUG(dbgs() << "mul: "; Candidate->Root->dump());
|
||||
Candidate->SetMemoryLocations();
|
||||
|
||||
// At the moment, we allow only simple chains that only consist of reads,
|
||||
// accumulate their result with an integer add, and thus that don't write
|
||||
// memory, and simply bail if they do.
|
||||
if (!MAC->ReadOnly)
|
||||
if (!Candidate->ReadOnly)
|
||||
return true;
|
||||
|
||||
// Now for all writes in the basic block, check that they don't alias with
|
||||
// the memory locations accessed by our MAC-chain:
|
||||
for (auto *I : Writes) {
|
||||
LLVM_DEBUG(dbgs() << "- "; I->dump());
|
||||
assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
|
||||
for (auto &MemLoc : MAC->MemLocs) {
|
||||
assert(Candidate->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
|
||||
for (auto &MemLoc : Candidate->MemLocs) {
|
||||
if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
|
||||
ModRefInfo::ModRef))) {
|
||||
LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
|
||||
@ -599,7 +607,7 @@ static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool CheckMACMemory(OpChainList &Candidates) {
|
||||
static bool CheckMulMemory(OpChainList &Candidates) {
|
||||
for (auto &C : Candidates) {
|
||||
// A mul has 2 operands, and a narrow op consist of sext and a load; thus
|
||||
// we expect at least 4 items in this operand value list.
|
||||
@ -607,7 +615,6 @@ static bool CheckMACMemory(OpChainList &Candidates) {
|
||||
LLVM_DEBUG(dbgs() << "Operand list too short.\n");
|
||||
return false;
|
||||
}
|
||||
C->SetMemoryLocations();
|
||||
ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
|
||||
ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
|
||||
|
||||
@ -620,6 +627,131 @@ static bool CheckMACMemory(OpChainList &Candidates) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Emit a load of type \p LoadTy from the address read by \p BaseLoad.
///
/// The pointer operand of \p BaseLoad is bitcast to a pointer to \p LoadTy in
/// the same address space, and the new load is given the alignment of
/// \p BaseLoad. Instructions are created at the current insertion point of
/// \p IRB; \p BaseLoad itself is left untouched.
static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst *BaseLoad,
                               const Type *LoadTy) {
  Value *NarrowPtr = BaseLoad->getPointerOperand();
  auto *WidePtrTy = LoadTy->getPointerTo(BaseLoad->getPointerAddressSpace());
  Value *WidePtr = IRB.CreateBitCast(NarrowPtr, WidePtrTy);

  // Keep the alignment of the original access for the new load.
  return IRB.CreateAlignedLoad(WidePtr, BaseLoad->getAlignment());
}
|
||||
|
||||
/// Attempt to widen loads and use smulbb, smulbt, smultb and smultt muls.
|
||||
// TODO: This, like smlad generation, expects the leave operands to be loads
|
||||
// that are sign extended. We should be able to handle scalar values as well
|
||||
// performing these muls on word x half types to generate smulwb and smulwt.
|
||||
bool ARMParallelDSP::MatchTopBottomMuls(BasicBlock *LoopBody) {
|
||||
LLVM_DEBUG(dbgs() << "Attempting to find BT|TB muls.\n");
|
||||
|
||||
OpChainList Candidates;
|
||||
for (auto &I : *LoopBody) {
|
||||
if (I.getOpcode() == Instruction::Mul) {
|
||||
if (I.getType()->getScalarSizeInBits() == 32 ||
|
||||
I.getType()->getScalarSizeInBits() == 64)
|
||||
AddMulCandidate(Candidates, &I, I.getOperand(0), I.getOperand(1));
|
||||
}
|
||||
}
|
||||
|
||||
if (Candidates.empty())
|
||||
return false;
|
||||
|
||||
Instructions Reads;
|
||||
Instructions Writes;
|
||||
AliasCandidates(LoopBody, Reads, Writes);
|
||||
|
||||
if (AreAliased(AA, Reads, Writes, Candidates))
|
||||
return false;
|
||||
|
||||
DenseMap<LoadInst*, Instruction*> LoadUsers;
|
||||
DenseMap<LoadInst*, LoadInst*> SeqLoads;
|
||||
SmallPtrSet<LoadInst*, 8> OffsetLoads;
|
||||
|
||||
for (unsigned i = 0; i < Candidates.size(); ++i) {
|
||||
for (unsigned j = 0; j < Candidates.size(); ++j) {
|
||||
if (i == j)
|
||||
continue;
|
||||
|
||||
OpChain *MulChain0 = Candidates[i].get();
|
||||
OpChain *MulChain1 = Candidates[j].get();
|
||||
|
||||
for (auto *Ld0 : MulChain0->Loads) {
|
||||
if (SeqLoads.count(Ld0) || OffsetLoads.count(Ld0))
|
||||
continue;
|
||||
|
||||
for (auto *Ld1 : MulChain1->Loads) {
|
||||
if (SeqLoads.count(Ld1) || OffsetLoads.count(Ld1))
|
||||
continue;
|
||||
|
||||
MemInstList VecMem;
|
||||
if (AreSequentialLoads(Ld0, Ld1, VecMem)) {
|
||||
SeqLoads[Ld0] = Ld1;
|
||||
OffsetLoads.insert(Ld1);
|
||||
LoadUsers[Ld0] = MulChain0->Root;
|
||||
LoadUsers[Ld1] = MulChain1->Root;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (SeqLoads.empty())
|
||||
return false;
|
||||
|
||||
IRBuilder<NoFolder> IRB(LoopBody);
|
||||
const Type *Ty = IntegerType::get(M->getContext(), 32);
|
||||
|
||||
// We know that at least one of the operands is a SExt of Ld.
|
||||
auto GetSExt = [](Instruction *I, LoadInst *Ld, unsigned OpIdx) -> Instruction* {
|
||||
if (!isa<Instruction>(I->getOperand(OpIdx)))
|
||||
return nullptr;
|
||||
|
||||
Value *SExt = nullptr;
|
||||
if (cast<Instruction>(I->getOperand(OpIdx))->getOperand(0) == Ld)
|
||||
SExt = I->getOperand(0);
|
||||
else
|
||||
SExt = I->getOperand(1);
|
||||
|
||||
return cast<Instruction>(SExt);
|
||||
};
|
||||
|
||||
LLVM_DEBUG(dbgs() << "Found some sequential loads, now widening:\n");
|
||||
for (auto &Pair : SeqLoads) {
|
||||
LoadInst *BaseLd = Pair.first;
|
||||
LoadInst *OffsetLd = Pair.second;
|
||||
IRB.SetInsertPoint(BaseLd);
|
||||
LoadInst *WideLd = CreateLoadIns(IRB, BaseLd, Ty);
|
||||
LLVM_DEBUG(dbgs() << " - with base load: " << *BaseLd << "\n");
|
||||
LLVM_DEBUG(dbgs() << " - created wide load: " << *WideLd << "\n");
|
||||
Instruction *BaseUser = LoadUsers[BaseLd];
|
||||
Instruction *OffsetUser = LoadUsers[OffsetLd];
|
||||
|
||||
Instruction *BaseSExt = GetSExt(BaseUser, BaseLd, 0);
|
||||
if (!BaseSExt)
|
||||
BaseSExt = GetSExt(BaseUser, BaseLd, 1);
|
||||
Instruction *OffsetSExt = GetSExt(OffsetUser, OffsetLd, 0);
|
||||
if (!OffsetSExt)
|
||||
OffsetSExt = GetSExt(OffsetUser, OffsetLd, 1);
|
||||
|
||||
assert((BaseSExt && OffsetSExt) && "failed to find SExts");
|
||||
|
||||
// BaseUser needs to: (asr (shl WideLoad, 16), 16)
|
||||
// OffsetUser needs to: (asr WideLoad, 16)
|
||||
auto *Shl = cast<Instruction>(IRB.CreateShl(WideLd, 16));
|
||||
auto *Bottom = cast<Instruction>(IRB.CreateAShr(Shl, 16));
|
||||
auto *Top = cast<Instruction>(IRB.CreateAShr(WideLd, 16));
|
||||
BaseUser->replaceUsesOfWith(BaseSExt, Bottom);
|
||||
OffsetUser->replaceUsesOfWith(OffsetSExt, Top);
|
||||
|
||||
BaseSExt->eraseFromParent();
|
||||
OffsetSExt->eraseFromParent();
|
||||
BaseLd->eraseFromParent();
|
||||
OffsetLd->eraseFromParent();
|
||||
}
|
||||
LLVM_DEBUG(dbgs() << "Block after top bottom mul replacements:\n"
|
||||
<< *LoopBody << "\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
|
||||
// multiplications.
|
||||
// To use SMLAD:
|
||||
@ -658,14 +790,15 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
|
||||
dbgs() << "Header block:\n"; Header->dump();
|
||||
dbgs() << "Loop info:\n\n"; L->dump());
|
||||
|
||||
bool Changed = false;
|
||||
ReductionList Reductions;
|
||||
MatchReductions(F, L, Header, Reductions);
|
||||
if (Reductions.empty())
|
||||
return false;
|
||||
|
||||
for (auto &R : Reductions) {
|
||||
OpChainList MACCandidates;
|
||||
MatchParallelMACSequences(R, MACCandidates);
|
||||
if (!CheckMACMemory(MACCandidates))
|
||||
if (!CheckMulMemory(MACCandidates))
|
||||
continue;
|
||||
|
||||
R.MACCandidates = std::move(MACCandidates);
|
||||
@ -682,6 +815,7 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
|
||||
Instructions Reads, Writes;
|
||||
AliasCandidates(Header, Reads, Writes);
|
||||
|
||||
bool Changed = false;
|
||||
for (auto &R : Reductions) {
|
||||
if (AreAliased(AA, Reads, Writes, R.MACCandidates))
|
||||
return false;
|
||||
@ -693,15 +827,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
|
||||
return Changed;
|
||||
}
|
||||
|
||||
/// Emit a load of type \p LoadTy from the address read by \p BaseLoad.
///
/// The pointer operand of \p BaseLoad is bitcast to a pointer to \p LoadTy in
/// the same address space, and the new load is given the alignment of
/// \p BaseLoad. Instructions are created at the current insertion point of
/// \p IRB.
static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
                               const Type *LoadTy) {
  Value *NarrowPtr = BaseLoad.getPointerOperand();
  auto *WidePtrTy = LoadTy->getPointerTo(BaseLoad.getPointerAddressSpace());
  Value *WidePtr = IRB.CreateBitCast(NarrowPtr, WidePtrTy);

  // Keep the alignment of the original access for the new load.
  return IRB.CreateAlignedLoad(WidePtr, BaseLoad.getAlignment());
}
|
||||
|
||||
Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
|
||||
Instruction *Acc, bool Exchange,
|
||||
Instruction *InsertAfter) {
|
||||
@ -716,8 +841,8 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
|
||||
|
||||
// Replace the reduction chain with an intrinsic call
|
||||
const Type *Ty = IntegerType::get(M->getContext(), 32);
|
||||
LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
|
||||
LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
|
||||
LoadInst *NewLd0 = CreateLoadIns(Builder, &VecLd0[0], Ty);
|
||||
LoadInst *NewLd1 = CreateLoadIns(Builder, &VecLd1[0], Ty);
|
||||
Value* Args[] = { NewLd0, NewLd1, Acc };
|
||||
Function *SMLAD = nullptr;
|
||||
if (Exchange)
|
||||
|
209
test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll
Normal file
209
test/CodeGen/ARM/paralleldsp-top-bottom-neg.ll
Normal file
@ -0,0 +1,209 @@
|
||||
; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: topbottom_mul_alias
|
||||
; CHECK-NOT: bitcast i16*
|
||||
define void @topbottom_mul_alias(i32 %N, i32* nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
|
||||
%PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
|
||||
%In1.0 = load i16, i16* %PIn1.0, align 2
|
||||
%SIn1.0 = sext i16 %In1.0 to i32
|
||||
%PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
|
||||
%In2.0 = load i16, i16* %PIn2.0, align 2
|
||||
%SIn2.0 = sext i16 %In2.0 to i32
|
||||
%mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
|
||||
%Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
|
||||
store i32 %mul5.us.i.i, i32* %Out.0, align 4
|
||||
%iv.1 = or i32 %iv, 1
|
||||
%PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
|
||||
%In1.1 = load i16, i16* %PIn1.1, align 2
|
||||
%SIn1.1 = sext i16 %In1.1 to i32
|
||||
%PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
|
||||
%In2.1 = load i16, i16* %PIn2.1, align 2
|
||||
%SIn2.1 = sext i16 %In2.1 to i32
|
||||
%mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
|
||||
%Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
|
||||
store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
|
||||
%iv.2 = or i32 %iv, 2
|
||||
%PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
|
||||
%In1.2 = load i16, i16* %PIn1.2, align 2
|
||||
%SIn1.2 = sext i16 %In1.2 to i32
|
||||
%PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
|
||||
%In2.2 = load i16, i16* %PIn2.2, align 2
|
||||
%SIn2.2 = sext i16 %In2.2 to i32
|
||||
%mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
|
||||
%Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
|
||||
store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
|
||||
%iv.3 = or i32 %iv, 3
|
||||
%PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
|
||||
%In1.3 = load i16, i16* %PIn1.3, align 2
|
||||
%SIn1.3 = sext i16 %In1.3 to i32
|
||||
%PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
|
||||
%In2.3 = load i16, i16* %PIn2.3, align 2
|
||||
%SIn2.3 = sext i16 %In2.3 to i32
|
||||
%mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
|
||||
%Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
|
||||
store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
|
||||
%iv.next = add i32 %iv, 4
|
||||
%count.next = add i32 %count, -4
|
||||
%niter375.ncmp.3.i = icmp eq i32 %count.next, 0
|
||||
br i1 %niter375.ncmp.3.i, label %exit, label %for.body
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; TODO: We should be able to handle this by splatting the const value.
|
||||
; CHECK-LABEL: topbottom_mul_const
|
||||
; CHECK-NOT: bitcast i16*
|
||||
define void @topbottom_mul_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16 signext %const) {
|
||||
entry:
|
||||
%conv4.i.i = sext i16 %const to i32
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
|
||||
%PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
|
||||
%In.0 = load i16, i16* %PIn.0, align 2
|
||||
%conv.us.i144.i = sext i16 %In.0 to i32
|
||||
%mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
|
||||
%Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
|
||||
store i32 %mul5.us.i.i, i32* %Out.0, align 4
|
||||
%iv.1 = or i32 %iv, 1
|
||||
%PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
|
||||
%In.1 = load i16, i16* %PIn.1, align 2
|
||||
%conv.us.i144.1.i = sext i16 %In.1 to i32
|
||||
%mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
|
||||
%Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
|
||||
store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
|
||||
%iv.2 = or i32 %iv, 2
|
||||
%PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
|
||||
%In.3 = load i16, i16* %PIn.2, align 2
|
||||
%conv.us.i144.2.i = sext i16 %In.3 to i32
|
||||
%mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
|
||||
%Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
|
||||
store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
|
||||
%iv.3 = or i32 %iv, 3
|
||||
%PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
|
||||
%In.4 = load i16, i16* %PIn.3, align 2
|
||||
%conv.us.i144.3.i = sext i16 %In.4 to i32
|
||||
%mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
|
||||
%Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
|
||||
store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
|
||||
%iv.next = add i32 %iv, 4
|
||||
%count.next = add i32 %count, -4
|
||||
%niter375.ncmp.3.i = icmp eq i32 %count.next, 0
|
||||
br i1 %niter375.ncmp.3.i, label %exit, label %for.body
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; TODO: We should be able to handle this and use smulwt and smulwb.
|
||||
; CHECK-LABEL: topbottom_mul_word_load_const
|
||||
; CHECK-NOT: bitcast i16*
|
||||
define void @topbottom_mul_word_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i32* %C) {
|
||||
entry:
|
||||
%const = load i32, i32* %C
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
|
||||
%PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
|
||||
%In.0 = load i16, i16* %PIn.0, align 2
|
||||
%conv.us.i144.i = sext i16 %In.0 to i32
|
||||
%mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %const
|
||||
%Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
|
||||
store i32 %mul5.us.i.i, i32* %Out.0, align 4
|
||||
%iv.1 = or i32 %iv, 1
|
||||
%PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
|
||||
%In.1 = load i16, i16* %PIn.1, align 2
|
||||
%conv.us.i144.1.i = sext i16 %In.1 to i32
|
||||
%mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %const
|
||||
%Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
|
||||
store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
|
||||
%iv.2 = or i32 %iv, 2
|
||||
%PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
|
||||
%In.3 = load i16, i16* %PIn.2, align 2
|
||||
%conv.us.i144.2.i = sext i16 %In.3 to i32
|
||||
%mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %const
|
||||
%Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
|
||||
store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
|
||||
%iv.3 = or i32 %iv, 3
|
||||
%PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
|
||||
%In.4 = load i16, i16* %PIn.3, align 2
|
||||
%conv.us.i144.3.i = sext i16 %In.4 to i32
|
||||
%mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %const
|
||||
%Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
|
||||
store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
|
||||
%iv.next = add i32 %iv, 4
|
||||
%count.next = add i32 %count, -4
|
||||
%niter375.ncmp.3.i = icmp eq i32 %count.next, 0
|
||||
br i1 %niter375.ncmp.3.i, label %exit, label %for.body
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: topbottom_mul_8
|
||||
; CHECK-NOT: bitcast i16*
|
||||
define void @topbottom_mul_8(i32 %N, i32* noalias nocapture readnone %Out, i8* nocapture readonly %In1, i8* nocapture readonly %In2) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
|
||||
%PIn1.0 = getelementptr inbounds i8, i8* %In1, i32 %iv
|
||||
%In1.0 = load i8, i8* %PIn1.0, align 1
|
||||
%SIn1.0 = sext i8 %In1.0 to i32
|
||||
%PIn2.0 = getelementptr inbounds i8, i8* %In2, i32 %iv
|
||||
%In2.0 = load i8, i8* %PIn2.0, align 1
|
||||
%SIn2.0 = sext i8 %In2.0 to i32
|
||||
%mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
|
||||
%Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
|
||||
store i32 %mul5.us.i.i, i32* %Out.0, align 4
|
||||
%iv.1 = or i32 %iv, 1
|
||||
%PIn1.1 = getelementptr inbounds i8, i8* %In1, i32 %iv.1
|
||||
%In1.1 = load i8, i8* %PIn1.1, align 1
|
||||
%SIn1.1 = sext i8 %In1.1 to i32
|
||||
%PIn2.1 = getelementptr inbounds i8, i8* %In2, i32 %iv.1
|
||||
%In2.1 = load i8, i8* %PIn2.1, align 1
|
||||
%SIn2.1 = sext i8 %In2.1 to i32
|
||||
%mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
|
||||
%Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
|
||||
store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
|
||||
%iv.2 = or i32 %iv, 2
|
||||
%PIn1.2 = getelementptr inbounds i8, i8* %In1, i32 %iv.2
|
||||
%In1.2 = load i8, i8* %PIn1.2, align 1
|
||||
%SIn1.2 = sext i8 %In1.2 to i32
|
||||
%PIn2.2 = getelementptr inbounds i8, i8* %In2, i32 %iv.2
|
||||
%In2.2 = load i8, i8* %PIn2.2, align 1
|
||||
%SIn2.2 = sext i8 %In2.2 to i32
|
||||
%mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
|
||||
%Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
|
||||
store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
|
||||
%iv.3 = or i32 %iv, 3
|
||||
%PIn1.3 = getelementptr inbounds i8, i8* %In1, i32 %iv.3
|
||||
%In1.3 = load i8, i8* %PIn1.3, align 1
|
||||
%SIn1.3 = sext i8 %In1.3 to i32
|
||||
%PIn2.3 = getelementptr inbounds i8, i8* %In2, i32 %iv.3
|
||||
%In2.3 = load i8, i8* %PIn2.3, align 1
|
||||
%SIn2.3 = sext i8 %In2.3 to i32
|
||||
%mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
|
||||
%Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
|
||||
store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
|
||||
%iv.next = add i32 %iv, 4
|
||||
%count.next = add i32 %count, -4
|
||||
%niter375.ncmp.3.i = icmp eq i32 %count.next, 0
|
||||
br i1 %niter375.ncmp.3.i, label %exit, label %for.body
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
251
test/CodeGen/ARM/paralleldsp-top-bottom.ll
Normal file
251
test/CodeGen/ARM/paralleldsp-top-bottom.ll
Normal file
@ -0,0 +1,251 @@
|
||||
; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: topbottom_mul
|
||||
define void @topbottom_mul(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; CHECK: for.body:
|
||||
; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
|
||||
; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
|
||||
; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
|
||||
; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
|
||||
; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
|
||||
|
||||
; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
|
||||
; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
|
||||
; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
|
||||
; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
|
||||
; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
|
||||
|
||||
; CHECK: mul nsw i32 [[PIn1_0]], [[PIn2_0]]
|
||||
; CHECK: mul nsw i32 [[PIn1_1]], [[PIn2_1]]
|
||||
|
||||
; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
|
||||
; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
|
||||
; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
|
||||
; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
|
||||
; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
|
||||
|
||||
; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
|
||||
; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
|
||||
; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
|
||||
; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
|
||||
; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
|
||||
|
||||
; CHECK: mul nsw i32 [[PIn1_2]], [[PIn2_2]]
|
||||
; CHECK: mul nsw i32 [[PIn1_3]], [[PIn2_3]]
|
||||
|
||||
for.body:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
|
||||
%PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
|
||||
%In1.0 = load i16, i16* %PIn1.0, align 2
|
||||
%SIn1.0 = sext i16 %In1.0 to i32
|
||||
%PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
|
||||
%In2.0 = load i16, i16* %PIn2.0, align 2
|
||||
%SIn2.0 = sext i16 %In2.0 to i32
|
||||
%mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
|
||||
%Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
|
||||
store i32 %mul5.us.i.i, i32* %Out.0, align 4
|
||||
%iv.1 = or i32 %iv, 1
|
||||
%PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
|
||||
%In1.1 = load i16, i16* %PIn1.1, align 2
|
||||
%SIn1.1 = sext i16 %In1.1 to i32
|
||||
%PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
|
||||
%In2.1 = load i16, i16* %PIn2.1, align 2
|
||||
%SIn2.1 = sext i16 %In2.1 to i32
|
||||
%mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
|
||||
%Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
|
||||
store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
|
||||
%iv.2 = or i32 %iv, 2
|
||||
%PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
|
||||
%In1.2 = load i16, i16* %PIn1.2, align 2
|
||||
%SIn1.2 = sext i16 %In1.2 to i32
|
||||
%PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
|
||||
%In2.2 = load i16, i16* %PIn2.2, align 2
|
||||
%SIn2.2 = sext i16 %In2.2 to i32
|
||||
%mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
|
||||
%Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
|
||||
store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
|
||||
%iv.3 = or i32 %iv, 3
|
||||
%PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
|
||||
%In1.3 = load i16, i16* %PIn1.3, align 2
|
||||
%SIn1.3 = sext i16 %In1.3 to i32
|
||||
%PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
|
||||
%In2.3 = load i16, i16* %PIn2.3, align 2
|
||||
%SIn2.3 = sext i16 %In2.3 to i32
|
||||
%mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
|
||||
%Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
|
||||
store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
|
||||
%iv.next = add i32 %iv, 4
|
||||
%count.next = add i32 %count, -4
|
||||
%niter375.ncmp.3.i = icmp eq i32 %count.next, 0
|
||||
br i1 %niter375.ncmp.3.i, label %exit, label %for.body
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: topbottom_mul_load_const
|
||||
define void @topbottom_mul_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16* %C) {
|
||||
entry:
|
||||
%const = load i16, i16* %C
|
||||
%conv4.i.i = sext i16 %const to i32
|
||||
br label %for.body
|
||||
|
||||
; CHECK: for.body:
|
||||
; CHECK: [[Cast_PIn_0:%[^ ]+]] = bitcast i16* %PIn.0 to i32*
|
||||
; CHECK: [[PIn_01:%[^ ]+]] = load i32, i32* [[Cast_PIn_0]], align 2
|
||||
; CHECK: [[PIn_01_shl:%[^ ]+]] = shl i32 [[PIn_01]], 16
|
||||
; CHECK: [[PIn_0:%[^ ]+]] = ashr i32 [[PIn_01_shl]], 16
|
||||
; CHECK: [[PIn_1:%[^ ]+]] = ashr i32 [[PIn_01]], 16
|
||||
|
||||
; CHECK: mul nsw i32 [[PIn_0]], %conv4.i.i
|
||||
; CHECK: mul nsw i32 [[PIn_1]], %conv4.i.i
|
||||
|
||||
; CHECK: [[Cast_PIn_2:%[^ ]+]] = bitcast i16* %PIn.2 to i32*
|
||||
; CHECK: [[PIn_23:%[^ ]+]] = load i32, i32* [[Cast_PIn_2]], align 2
|
||||
; CHECK: [[PIn_23_shl:%[^ ]+]] = shl i32 [[PIn_23]], 16
|
||||
; CHECK: [[PIn_2:%[^ ]+]] = ashr i32 [[PIn_23_shl]], 16
|
||||
; CHECK: [[PIn_3:%[^ ]+]] = ashr i32 [[PIn_23]], 16
|
||||
|
||||
; CHECK: mul nsw i32 [[PIn_2]], %conv4.i.i
|
||||
; CHECK: mul nsw i32 [[PIn_3]], %conv4.i.i
|
||||
|
||||
for.body:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
|
||||
%PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
|
||||
%In.0 = load i16, i16* %PIn.0, align 2
|
||||
%conv.us.i144.i = sext i16 %In.0 to i32
|
||||
%mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
|
||||
%Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
|
||||
store i32 %mul5.us.i.i, i32* %Out.0, align 4
|
||||
%iv.1 = or i32 %iv, 1
|
||||
%PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
|
||||
%In.1 = load i16, i16* %PIn.1, align 2
|
||||
%conv.us.i144.1.i = sext i16 %In.1 to i32
|
||||
%mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
|
||||
%Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
|
||||
store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
|
||||
%iv.2 = or i32 %iv, 2
|
||||
%PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
|
||||
%In.3 = load i16, i16* %PIn.2, align 2
|
||||
%conv.us.i144.2.i = sext i16 %In.3 to i32
|
||||
%mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
|
||||
%Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
|
||||
store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
|
||||
%iv.3 = or i32 %iv, 3
|
||||
%PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
|
||||
%In.4 = load i16, i16* %PIn.3, align 2
|
||||
%conv.us.i144.3.i = sext i16 %In.4 to i32
|
||||
%mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
|
||||
%Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
|
||||
store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
|
||||
%iv.next = add i32 %iv, 4
|
||||
%count.next = add i32 %count, -4
|
||||
%niter375.ncmp.3.i = icmp eq i32 %count.next, 0
|
||||
br i1 %niter375.ncmp.3.i, label %exit, label %for.body
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: topbottom_mul_64
|
||||
define void @topbottom_mul_64(i32 %N, i64* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; CHECK: for.body:
|
||||
; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
|
||||
; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
|
||||
; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
|
||||
; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
|
||||
; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
|
||||
|
||||
; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
|
||||
; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
|
||||
; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
|
||||
; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
|
||||
; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
|
||||
|
||||
; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[PIn1_0]], [[PIn2_0]]
|
||||
; CHECK: [[SMul0:%[^ ]+]] = sext i32 [[Mul0]] to i64
|
||||
; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[PIn1_1]], [[PIn2_1]]
|
||||
; CHECK: [[SMul1:%[^ ]+]] = sext i32 [[Mul1]] to i64
|
||||
; CHECK: add i64 [[SMul0]], [[SMul1]]
|
||||
|
||||
; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
|
||||
; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
|
||||
; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
|
||||
; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
|
||||
; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
|
||||
|
||||
; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
|
||||
; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
|
||||
; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
|
||||
; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
|
||||
; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
|
||||
|
||||
; CHECK: [[Mul2:%[^ ]+]] = mul nsw i32 [[PIn1_2]], [[PIn2_2]]
|
||||
; CHECK: [[SMul2:%[^ ]+]] = sext i32 [[Mul2]] to i64
|
||||
; CHECK: [[Mul3:%[^ ]+]] = mul nsw i32 [[PIn1_3]], [[PIn2_3]]
|
||||
; CHECK: [[SMul3:%[^ ]+]] = sext i32 [[Mul3]] to i64
|
||||
; CHECK: add i64 [[SMul2]], [[SMul3]]
|
||||
|
||||
for.body:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%iv.out = phi i32 [ 0, %entry] , [ %iv.out.next, %for.body ]
|
||||
%count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
|
||||
%PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
|
||||
%In1.0 = load i16, i16* %PIn1.0, align 2
|
||||
%SIn1.0 = sext i16 %In1.0 to i32
|
||||
%PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
|
||||
%In2.0 = load i16, i16* %PIn2.0, align 2
|
||||
%SIn2.0 = sext i16 %In2.0 to i32
|
||||
%mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
|
||||
%sext.0 = sext i32 %mul5.us.i.i to i64
|
||||
%iv.1 = or i32 %iv, 1
|
||||
%PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
|
||||
%In1.1 = load i16, i16* %PIn1.1, align 2
|
||||
%SIn1.1 = sext i16 %In1.1 to i32
|
||||
%PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
|
||||
%In2.1 = load i16, i16* %PIn2.1, align 2
|
||||
%SIn2.1 = sext i16 %In2.1 to i32
|
||||
%mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
|
||||
%sext.1 = sext i32 %mul5.us.i.1.i to i64
|
||||
%mac.0 = add i64 %sext.0, %sext.1
|
||||
%Out.0 = getelementptr inbounds i64, i64* %Out, i32 %iv.out
|
||||
store i64 %mac.0, i64* %Out.0, align 4
|
||||
%iv.2 = or i32 %iv, 2
|
||||
%PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
|
||||
%In1.2 = load i16, i16* %PIn1.2, align 2
|
||||
%SIn1.2 = sext i16 %In1.2 to i32
|
||||
%PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
|
||||
%In2.2 = load i16, i16* %PIn2.2, align 2
|
||||
%SIn2.2 = sext i16 %In2.2 to i32
|
||||
%mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
|
||||
%sext.2 = sext i32 %mul5.us.i.2.i to i64
|
||||
%iv.3 = or i32 %iv, 3
|
||||
%PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
|
||||
%In1.3 = load i16, i16* %PIn1.3, align 2
|
||||
%SIn1.3 = sext i16 %In1.3 to i32
|
||||
%PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
|
||||
%In2.3 = load i16, i16* %PIn2.3, align 2
|
||||
%SIn2.3 = sext i16 %In2.3 to i32
|
||||
%mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
|
||||
%sext.3 = sext i32 %mul5.us.i.3.i to i64
|
||||
%mac.1 = add i64 %sext.2, %sext.3
|
||||
%iv.out.1 = or i32 %iv.out, 1
|
||||
%Out.1 = getelementptr inbounds i64, i64* %Out, i32 %iv.out.1
|
||||
store i64 %mac.1, i64* %Out.1, align 4
|
||||
%iv.next = add i32 %iv, 4
|
||||
%iv.out.next = add i32 %iv.out, 2
|
||||
%count.next = add i32 %count, -4
|
||||
%niter375.ncmp.3.i = icmp eq i32 %count.next, 0
|
||||
br i1 %niter375.ncmp.3.i, label %exit, label %for.body
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user