Mirror of https://github.com/RPCS3/llvm-mirror.git, synced 2025-02-01 05:01:59 +01:00
Revert r367389 (and follow-up r368404); it caused PR43073.
llvm-svn: 369567
parent f2a0708d5e
commit 703d55e86d
@@ -1,4 +1,4 @@
//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -18,10 +18,13 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
@@ -68,7 +71,7 @@ namespace {
}

LoadInst *getBaseLoad() const {
return VecLd.front();
return cast<LoadInst>(LHS);
}
};

@@ -155,11 +158,13 @@ namespace {
}
};

class ARMParallelDSP : public FunctionPass {
class ARMParallelDSP : public LoopPass {
ScalarEvolution *SE;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
DominatorTree *DT;
LoopInfo *LI;
Loop *L;
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -180,38 +185,63 @@ namespace {
/// products to a 32-bit accumulate operand. Optionally, the instruction can
/// exchange the halfwords of the second operand before performing the
/// arithmetic.
bool MatchSMLAD(Function &F);
bool MatchSMLAD(Loop *L);

public:
static char ID;

ARMParallelDSP() : FunctionPass(ID) { }
ARMParallelDSP() : LoopPass(ID) { }

bool doInitialization(Loop *L, LPPassManager &LPM) override {
LoadPairs.clear();
WideLoads.clear();
return true;
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
LoopPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.setPreservesCFG();
}

bool runOnFunction(Function &F) override {
bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
if (DisableParallelDSP)
return false;
if (skipFunction(F))
if (skipLoop(TheLoop))
return false;

L = TheLoop;
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &TPC = getAnalysis<TargetPassConfig>();

BasicBlock *Header = TheLoop->getHeader();
if (!Header)
return false;

// TODO: We assume the loop header and latch to be the same block.
// This is not a fundamental restriction, but lifting this would just
// require more work to do the transformation and then patch up the CFG.
if (Header != TheLoop->getLoopLatch()) {
LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
"running pass ARMParallelDSP\n");
return false;
}

if (!TheLoop->getLoopPreheader())
InsertPreheaderForLoop(L, DT, LI, nullptr, true);

Function &F = *Header->getParent();
M = F.getParent();
DL = &M->getDataLayout();

@@ -236,10 +266,17 @@ namespace {
return false;
}

LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);

LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

bool Changes = MatchSMLAD(F);
if (!RecordMemoryOps(Header)) {
LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
return false;
}

bool Changes = MatchSMLAD(L);
return Changes;
}
};
@@ -300,8 +337,6 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) {
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
SmallVector<Instruction*, 8> Writes;
LoadPairs.clear();
WideLoads.clear();

// Collect loads and instruction that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
@@ -379,7 +414,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
return LoadPairs.size() > 1;
}

// The pass needs to identify integer add/sub reductions of 16-bit vector
// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find integer add then look for this pattern:
@@ -410,13 +445,13 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
bool ARMParallelDSP::MatchSMLAD(Function &F) {
bool ARMParallelDSP::MatchSMLAD(Loop *L) {
// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumlator.
std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
(Value *V, BasicBlock *BB, Reduction &R) -> bool {
std::function<bool(Value*, Reduction&)> Search = [&]
(Value *V, Reduction &R) -> bool {

// If we find a non-instruction, try to use it as the initial accumulator
// value. This may have already been found during the search in which case
@@ -425,9 +460,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
if (!I)
return R.InsertAcc(V);

if (I->getParent() != BB)
return false;

switch (I->getOpcode()) {
default:
break;
@@ -438,8 +470,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
// Adds should be adding together two muls, or another add and a mul to
// be within the mac chain. One of the operands may also be the
// accumulator value at which point we should stop searching.
bool ValidLHS = Search(I->getOperand(0), BB, R);
bool ValidRHS = Search(I->getOperand(1), BB, R);
bool ValidLHS = Search(I->getOperand(0), R);
bool ValidRHS = Search(I->getOperand(1), R);
if (!ValidLHS && !ValidLHS)
return false;
else if (ValidLHS && ValidRHS) {
@@ -465,40 +497,36 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
return false;
}
case Instruction::SExt:
return Search(I->getOperand(0), BB, R);
return Search(I->getOperand(0), R);
}
return false;
};

bool Changed = false;
SmallPtrSet<Instruction*, 4> AllAdds;
BasicBlock *Latch = L->getLoopLatch();

for (auto &BB : F) {
SmallPtrSet<Instruction*, 4> AllAdds;
if (!RecordMemoryOps(&BB))
for (Instruction &I : reverse(*Latch)) {
if (I.getOpcode() != Instruction::Add)
continue;

for (Instruction &I : reverse(BB)) {
if (I.getOpcode() != Instruction::Add)
continue;
if (AllAdds.count(&I))
continue;

if (AllAdds.count(&I))
continue;
const auto *Ty = I.getType();
if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
continue;

const auto *Ty = I.getType();
if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
continue;
Reduction R(&I);
if (!Search(&I, R))
continue;

Reduction R(&I);
if (!Search(&I, &BB, R))
continue;
if (!CreateParallelPairs(R))
continue;

if (!CreateParallelPairs(R))
continue;

InsertParallelMACs(R);
Changed = true;
AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
}
InsertParallelMACs(R);
Changed = true;
AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
}

return Changed;
@@ -696,15 +724,13 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
// TODO: Support big-endian as well.
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
BaseSExt->replaceAllUsesWith(NewBaseSExt);
BaseSExt->setOperand(0, Bottom);

IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
OffsetSExt->setOperand(0, Trunc);

WideLoads.emplace(std::make_pair(Base,
std::make_unique<WidenedLoad>(Loads, WideLoad)));
@@ -718,6 +744,6 @@ Pass *llvm::createARMParallelDSPPass() {
char ARMParallelDSP::ID = 0;

INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
"Transform functions to use DSP intrinsics", false, false)
"Transform loops to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
"Transform functions to use DSP intrinsics", false, false)
"Transform loops to use DSP intrinsics", false, false)
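Not part of the commit, but for orientation: a minimal LLVM IR sketch of the add/mul reduction shape that MatchSMLAD searches for, i.e. two sequential sign-extended i16 loads per operand, multiplied and summed into an i32 accumulator (in the loop-pass form restored above, the chain must also sit in a loop whose header is its latch). The function and value names below are illustrative only, in the style of the deleted tests that follow.

; Illustrative only; mirrors the deleted tests below, not code from this commit.
define i32 @smlad_shape(i16* %a, i16* %b, i32 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %ld.b.0 = load i16, i16* %b
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add = add i32 %mul.0, %mul.1   ; the two products are reduced...
  %res = add i32 %add, %acc       ; ...and folded into the accumulator, giving one smlad
  ret i32 %res
}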
@@ -37,7 +37,8 @@
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Transform functions to use DSP intrinsics
; CHECK-NEXT: Loop Pass Manager
; CHECK-NEXT: Transform loops to use DSP intrinsics
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: ARM IR optimizations
; CHECK-NEXT: Dominator Tree Construction
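As an aside, pass-structure listings like the CHECK-NEXT lines above are typically produced with the legacy pass manager's -debug-pass=Structure option; a RUN line of roughly the following shape (an assumption for illustration, not quoted from this commit) drives such a pipeline test:

; RUN: llc -mtriple=armv8a-none-eabi -O3 -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck %s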
@@ -1,79 +0,0 @@
; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s

; CHECK-LABEL: single_block
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
entry:
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.1 = load i16, i16* %addr.a.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%ld.b.1 = load i16, i16* %addr.b.1
%sext.b.1 = sext i16 %ld.b.1 to i32
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%add = add i32 %mul.0, %mul.1
%res = add i32 %add, %acc
ret i32 %res
}

; CHECK-LABEL: multi_block
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
entry:
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.1 = load i16, i16* %addr.a.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%ld.b.1 = load i16, i16* %addr.b.1
%sext.b.1 = sext i16 %ld.b.1 to i32
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%add = add i32 %mul.0, %mul.1
br label %bb.1

bb.1:
%res = add i32 %add, %acc
ret i32 %res
}

; CHECK-LABEL: multi_block_1
; CHECK-NOT: call i32 @llvm.arm.smlad
define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
entry:
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
br label %bb.1

bb.1:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.1 = load i16, i16* %addr.a.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%ld.b.1 = load i16, i16* %addr.b.1
%sext.b.1 = sext i16 %ld.b.1 to i32
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%add = add i32 %mul.0, %mul.1
%res = add i32 %add, %acc
ret i32 %res
}
@@ -1,329 +0,0 @@
; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s

; CHECK-LABEL: exchange_1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.1
%mul.1 = mul i32 %sext.a.1, %sext.b.0
%add = add i32 %mul.0, %mul.1
%res = add i32 %add, %acc
ret i32 %res
}

; CHECK-LABEL: exchange_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.b.1, %sext.a.0
%mul.1 = mul i32 %sext.b.0, %sext.a.1
%add = add i32 %mul.0, %mul.1
%res = add i32 %add, %acc
ret i32 %res
}

; CHECK-LABEL: exchange_3
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.1
%mul.1 = mul i32 %sext.a.1, %sext.b.0
%add = add i32 %mul.1, %mul.0
%res = add i32 %add, %acc
ret i32 %res
}

; CHECK-LABEL: exchange_4
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.b.1, %sext.a.0
%mul.1 = mul i32 %sext.b.0, %sext.a.1
%add = add i32 %mul.1, %mul.0
%res = add i32 %add, %acc
ret i32 %res
}
; CHECK-LABEL: exchange_multi_use_1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.1
%mul.1 = mul i32 %sext.a.1, %sext.b.0
%add = add i32 %mul.0, %mul.1
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.a.3 = getelementptr i16, i16* %a, i32 3
%ld.a.2 = load i16, i16* %addr.a.2
%ld.a.3 = load i16, i16* %addr.a.3
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.a.3 = sext i16 %ld.a.3 to i32
%mul.2 = mul i32 %sext.a.3, %sext.b.1
%mul.3 = mul i32 %sext.a.2, %sext.b.0
%add.1 = add i32 %mul.2, %mul.3
%add.2 = add i32 %add, %add.1
%res = add i32 %add.2, %acc
ret i32 %res
}

; CHECK-LABEL: exchange_multi_use_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%add = add i32 %mul.0, %mul.1
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.a.3 = getelementptr i16, i16* %a, i32 3
%ld.a.2 = load i16, i16* %addr.a.2
%ld.a.3 = load i16, i16* %addr.a.3
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.a.3 = sext i16 %ld.a.3 to i32
%mul.2 = mul i32 %sext.b.0, %sext.a.3
%mul.3 = mul i32 %sext.b.1, %sext.a.2
%add.1 = add i32 %mul.2, %mul.3
%add.2 = add i32 %add, %add.1
%res = add i32 %add.2, %acc
ret i32 %res
}

; TODO: Why aren't two intrinsics generated?
; CHECK-LABEL: exchange_multi_use_3
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK-NOT: call i32 @llvm.arm.smlad
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0
define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.a.3 = getelementptr i16, i16* %a, i32 3
%ld.a.2 = load i16, i16* %addr.a.2
%ld.a.3 = load i16, i16* %addr.a.3
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.a.3 = sext i16 %ld.a.3 to i32
%mul.2 = mul i32 %sext.b.0, %sext.a.3
%mul.3 = mul i32 %sext.b.1, %sext.a.2
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%add = add i32 %mul.0, %mul.1
%add.1 = add i32 %mul.2, %mul.3
%sub = sub i32 %add, %add.1
%res = add i32 %acc, %sub
ret i32 %res
}
; TODO: Why isn't smladx generated too?
; CHECK-LABEL: exchange_multi_use_4
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0
; CHECK-NOT: call i32 @llvm.arm.smlad
define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.a.3 = getelementptr i16, i16* %a, i32 3
%ld.a.2 = load i16, i16* %addr.a.2
%ld.a.3 = load i16, i16* %addr.a.3
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.a.3 = sext i16 %ld.a.3 to i32
%mul.2 = mul i32 %sext.b.0, %sext.a.3
%mul.3 = mul i32 %sext.b.1, %sext.a.2
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%add.1 = add i32 %mul.2, %mul.3
%add = add i32 %mul.0, %mul.1
%sub = sub i32 %add, %add.1
%res = add i32 %acc, %sub
ret i32 %res
}

; CHECK-LABEL: exchange_swap
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.1, %sext.b.0
%mul.1 = mul i32 %sext.a.0, %sext.b.1
%add = add i32 %mul.0, %mul.1
%res = add i32 %add, %acc
ret i32 %res
}

; CHECK-LABEL: exchange_swap_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.1, %sext.b.0
%mul.1 = mul i32 %sext.a.0, %sext.b.1
%add = add i32 %mul.1, %mul.0
%res = add i32 %add, %acc
ret i32 %res
}

; CHECK-LABEL: exchange_swap_3
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.b.0, %sext.a.1
%mul.1 = mul i32 %sext.b.1, %sext.a.0
%add = add i32 %mul.1, %mul.0
%res = add i32 %add, %acc
ret i32 %res
}
@@ -1,161 +0,0 @@
; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s

; CHECK-LABEL: overlap_1
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.b.2 = getelementptr i16, i16* %b, i32 2
%ld.a.2 = load i16, i16* %addr.a.2
%ld.b.2 = load i16, i16* %addr.b.2
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.b.2 = sext i16 %ld.b.2 to i32
%mul.2 = mul i32 %sext.a.2, %sext.b.2
%add = add i32 %mul.0, %mul.1
%add.1 = add i32 %mul.1, %mul.2
%add.2 = add i32 %add.1, %add
%res = add i32 %add.2, %acc
ret i32 %res
}

; CHECK-LABEL: overlap_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.b.2 = getelementptr i16, i16* %b, i32 2
%ld.a.2 = load i16, i16* %addr.a.2
%ld.b.2 = load i16, i16* %addr.b.2
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.b.2 = sext i16 %ld.b.2 to i32
%mul.2 = mul i32 %sext.b.2, %sext.a.2
%add = add i32 %mul.0, %mul.1
%add.1 = add i32 %mul.1, %mul.2
%add.2 = add i32 %add, %add.1
%res = add i32 %add.2, %acc
ret i32 %res
}
; CHECK-LABEL: overlap_3
; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.b.2 = getelementptr i16, i16* %b, i32 2
%addr.a.3 = getelementptr i16, i16* %a, i32 3
%ld.a.2 = load i16, i16* %addr.a.2
%ld.b.2 = load i16, i16* %addr.b.2
%ld.a.3 = load i16, i16* %addr.a.3
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.b.2 = sext i16 %ld.b.2 to i32
%sext.a.3 = sext i16 %ld.a.3 to i32
%mul.2 = mul i32 %sext.a.2, %sext.b.1
%mul.3 = mul i32 %sext.a.3, %sext.b.2
%add = add i32 %mul.0, %mul.1
%add.1 = add i32 %mul.2, %mul.3
%add.2 = add i32 %add.1, %add
%res = add i32 %add.2, %acc
ret i32 %res
}

; CHECK-LABEL: overlap_4
; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {
entry:
%addr.a.1 = getelementptr i16, i16* %a, i32 1
%addr.b.1 = getelementptr i16, i16* %b, i32 1
%ld.a.0 = load i16, i16* %a
%sext.a.0 = sext i16 %ld.a.0 to i32
%ld.b.0 = load i16, i16* %b
%ld.a.1 = load i16, i16* %addr.a.1
%ld.b.1 = load i16, i16* %addr.b.1
%sext.a.1 = sext i16 %ld.a.1 to i32
%sext.b.1 = sext i16 %ld.b.1 to i32
%sext.b.0 = sext i16 %ld.b.0 to i32
%mul.0 = mul i32 %sext.a.0, %sext.b.0
%mul.1 = mul i32 %sext.a.1, %sext.b.1
%addr.a.2 = getelementptr i16, i16* %a, i32 2
%addr.b.2 = getelementptr i16, i16* %b, i32 2
%addr.a.3 = getelementptr i16, i16* %a, i32 3
%ld.a.2 = load i16, i16* %addr.a.2
%ld.b.2 = load i16, i16* %addr.b.2
%ld.a.3 = load i16, i16* %addr.a.3
%sext.a.2 = sext i16 %ld.a.2 to i32
%sext.b.2 = sext i16 %ld.b.2 to i32
%sext.a.3 = sext i16 %ld.a.3 to i32
%mul.2 = mul i32 %sext.b.2, %sext.a.2
%mul.3 = mul i32 %sext.b.1, %sext.a.3
%add = add i32 %mul.0, %mul.1
%add.1 = add i32 %mul.2, %mul.3
%add.2 = add i32 %add.1, %add
%res = add i32 %add.2, %acc
ret i32 %res
}
@@ -2,7 +2,7 @@
;
; The loop header is not the loop latch.
;
; CHECK: call i32 @llvm.arm.smlad
; CHECK-NOT: call i32 @llvm.arm.smlad
;
define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry: