
[ARM][MVE] Optimise offset addresses of gathers/scatters

This patch adds an analysis of the offset addresses used by gathers
and scatters to the MVEGatherScatterLowering pass, finding
multiplications and additions that are loop invariant and can thus be
moved into the loop preheader, avoiding executing them on every
iteration.

Differential Revision: https://reviews.llvm.org/D76681
Anna Welker 2020-04-08 11:43:55 +01:00
parent 986d90607e
commit 5cfdb4f2d2
4 changed files with 1349 additions and 20 deletions
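In essence (a minimal before/after sketch distilled from the push_out_add tests added below; the %offs and %ptrs names are illustrative), an invariant add of a splat constant to the vector induction variable is absorbed into the start value of the phi, so the loop body can feed the phi to the gather directly:

; before: the add executes on every loop iteration
vector.body:
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %offs = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
  %ptrs = getelementptr inbounds i32, i32* %data, <4 x i32> %offs

; after: the add has been hoisted into the preheader
vector.ph:
  %PushedOutAdd = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
vector.body:
  %vec.ind = phi <4 x i32> [ %PushedOutAdd, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %ptrs = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind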


@@ -37,6 +37,7 @@
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
@@ -67,6 +68,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<TargetPassConfig>();
AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -83,7 +85,7 @@ private:
// Compute the scale of this gather/scatter instruction
int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
bool lowerGather(IntrinsicInst *I);
Value *lowerGather(IntrinsicInst *I);
// Create a gather from a base + vector of offsets
Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
Instruction *&Root, IRBuilder<> &Builder);
@@ -91,13 +93,22 @@ private:
Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
IRBuilder<> &Builder);
bool lowerScatter(IntrinsicInst *I);
Value *lowerScatter(IntrinsicInst *I);
// Create a scatter to a base + vector of offsets
Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Ptr,
Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets,
IRBuilder<> &Builder);
// Create a scatter to a vector of pointers
Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
IRBuilder<> &Builder);
// Check whether these offsets could be moved out of the loop they're in
bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
// Pushes the given add out of the loop
void pushOutAdd(PHINode *&Phi, Value *OffsSecondOperand, unsigned StartIndex);
// Pushes the given mul out of the loop
void pushOutMul(PHINode *&Phi, Value *IncrementPerRound,
Value *OffsSecondOperand, unsigned LoopIncrement,
IRBuilder<> &Builder);
};
} // end anonymous namespace
@@ -205,7 +216,7 @@ int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
return -1;
}
bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
using namespace PatternMatch;
LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
@@ -220,7 +231,7 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
Ty->getScalarSizeInBits(), Alignment))
return false;
return nullptr;
lookThroughBitcast(Ptr);
assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
@@ -233,7 +244,7 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
if (!Load)
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
if (!Load)
return false;
return nullptr;
if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - "
@@ -247,12 +258,14 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
// If this was an extending gather, we need to get rid of the sext/zext
// as well as of the gather itself
I->eraseFromParent();
LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
return true;
return Load;
}
Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
Value *Ptr,
IRBuilder<> &Builder) {
using namespace PatternMatch;
Type *Ty = I->getType();
LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
@@ -287,7 +300,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
if (!I->hasOneUse())
return nullptr;
// The correct root to replace is the not the CallInst itself, but the
// The correct root to replace is not the CallInst itself, but the
// instruction which extends it
Extend = cast<Instruction>(*I->users().begin());
if (isa<SExtInst>(Extend)) {
@@ -334,7 +347,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
}
bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
using namespace PatternMatch;
LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
@@ -348,7 +361,7 @@ bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
Ty->getScalarSizeInBits(), Alignment))
return false;
return nullptr;
lookThroughBitcast(Ptr);
assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
@@ -360,12 +373,12 @@ bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
if (!Store)
Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
if (!Store)
return false;
return nullptr;
LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
I->replaceAllUsesWith(Store);
I->eraseFromParent();
return true;
return Store;
}
Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
@@ -445,6 +458,263 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
Builder.getInt32(Scale)});
}
void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
Value *OffsSecondOperand,
unsigned StartIndex) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n");
Instruction *InsertionPoint;
if (isa<Instruction>(OffsSecondOperand))
InsertionPoint = &cast<Instruction>(OffsSecondOperand)->getParent()->back();
else
InsertionPoint =
&cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back());
// Initialize the phi with a vector that contains a sum of the constants
Instruction *NewIndex = BinaryOperator::Create(
Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand,
"PushedOutAdd", InsertionPoint);
unsigned IncrementIndex = StartIndex == 0 ? 1 : 0;
// Order such that start index comes first (this reduces movs)
Phi->addIncoming(NewIndex, Phi->getIncomingBlock(StartIndex));
Phi->addIncoming(Phi->getIncomingValue(IncrementIndex),
Phi->getIncomingBlock(IncrementIndex));
Phi->removeIncomingValue(IncrementIndex);
Phi->removeIncomingValue(StartIndex);
}
void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
Value *IncrementPerRound,
Value *OffsSecondOperand,
unsigned LoopIncrement,
IRBuilder<> &Builder) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising mul instruction\n");
// Create a new scalar add outside of the loop and transform it to a splat
// by which the loop variable can be incremented
Instruction *InsertionPoint;
if (isa<Instruction>(OffsSecondOperand))
InsertionPoint = &cast<Instruction>(OffsSecondOperand)->getParent()->back();
else
InsertionPoint = &cast<Instruction>(
Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back());
// Create a new index
Value *StartIndex = BinaryOperator::Create(
Instruction::Mul, Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1),
OffsSecondOperand, "PushedOutMul", InsertionPoint);
Instruction *Product =
BinaryOperator::Create(Instruction::Mul, IncrementPerRound,
OffsSecondOperand, "Product", InsertionPoint);
// Increment NewIndex by Product instead of the multiplication
Instruction *NewIncrement = BinaryOperator::Create(
Instruction::Add, Phi, Product, "IncrementPushedOutMul",
cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back())
.getPrevNode());
Phi->addIncoming(StartIndex,
Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1));
Phi->addIncoming(NewIncrement, Phi->getIncomingBlock(LoopIncrement));
Phi->removeIncomingValue((unsigned)0);
Phi->removeIncomingValue((unsigned)0);
return;
}
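Why pushing out the mul is sound (a short derivation, not taken from the patch itself): if the phi starts at S and is incremented by I every round, multiplying it by a loop-invariant C yields the offset sequence S*C, (S+I)*C, (S+2*I)*C, and so on. Because (x + I)*C = x*C + I*C, the identical sequence is produced by a phi that starts at the hoisted StartIndex = S*C and is incremented by the hoisted Product = I*C each round, which is exactly what pushOutMul builds:

  before: (S)*C,  (S + I)*C,    (S + 2*I)*C,    ...
  after:  S*C,    S*C + (I*C),  S*C + 2*(I*C),  ...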
// Return true if the given intrinsic is a gather or scatter
bool isGatherScatter(IntrinsicInst *IntInst) {
if (IntInst == nullptr)
return false;
unsigned IntrinsicID = IntInst->getIntrinsicID();
return (IntrinsicID == Intrinsic::masked_gather ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_base ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
IntrinsicID == Intrinsic::masked_scatter ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
}
// Check whether all usages of this instruction are as offsets of
// gathers/scatters or simple arithmetic that is only used by gathers/scatters
bool hasAllGatScatUsers(Instruction *I) {
if (I->hasNUses(0)) {
return false;
}
bool Gatscat = true;
for (User *U : I->users()) {
if (!isa<Instruction>(U))
return false;
if (isa<GetElementPtrInst>(U) ||
isGatherScatter(dyn_cast<IntrinsicInst>(U))) {
return Gatscat;
} else {
unsigned OpCode = cast<Instruction>(U)->getOpcode();
if ((OpCode == Instruction::Add || OpCode == Instruction::Mul) &&
hasAllGatScatUsers(cast<Instruction>(U))) {
continue;
}
return false;
}
}
return Gatscat;
}
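For intuition, an illustrative IR fragment (the names are made up for the example): %offs passes this check because every transitive user bottoms out in a gather, whereas a single extraneous user, e.g. a store of %offs, would make the check fail:

  %offs = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
  %gep = getelementptr inbounds i32, i32* %data, <4 x i32> %offs
  %g1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ; an add/mul user is also fine, as long as its own users are all gathers/scatters
  %offs2 = mul <4 x i32> %offs, <i32 2, i32 2, i32 2, i32 2>
  %gep2 = getelementptr inbounds i32, i32* %data, <4 x i32> %offs2
  %g2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)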
bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
LoopInfo *LI) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n");
// Optimise the addresses of gathers/scatters by moving invariant
// calculations out of the loop
if (!isa<Instruction>(Offsets))
return false;
Instruction *Offs = cast<Instruction>(Offsets);
if (Offs->getOpcode() != Instruction::Add &&
Offs->getOpcode() != Instruction::Mul)
return false;
Loop *L = LI->getLoopFor(BB);
if (L == nullptr)
return false;
if (!Offs->hasOneUse()) {
if (!hasAllGatScatUsers(Offs))
return false;
}
// Find out which, if any, operand of the instruction
// is a phi node
PHINode *Phi;
int OffsSecondOp;
if (isa<PHINode>(Offs->getOperand(0))) {
Phi = cast<PHINode>(Offs->getOperand(0));
OffsSecondOp = 1;
} else if (isa<PHINode>(Offs->getOperand(1))) {
Phi = cast<PHINode>(Offs->getOperand(1));
OffsSecondOp = 0;
} else {
bool Changed = true;
if (isa<Instruction>(Offs->getOperand(0)) &&
L->contains(cast<Instruction>(Offs->getOperand(0))))
Changed |= optimiseOffsets(Offs->getOperand(0), BB, LI);
if (isa<Instruction>(Offs->getOperand(1)) &&
L->contains(cast<Instruction>(Offs->getOperand(1))))
Changed |= optimiseOffsets(Offs->getOperand(1), BB, LI);
if (!Changed) {
return false;
} else {
if (isa<PHINode>(Offs->getOperand(0))) {
Phi = cast<PHINode>(Offs->getOperand(0));
OffsSecondOp = 1;
} else if (isa<PHINode>(Offs->getOperand(1))) {
Phi = cast<PHINode>(Offs->getOperand(1));
OffsSecondOp = 0;
} else {
return false;
}
}
}
// A phi node we want to perform this function on should be from the
// loop header and have exactly 2 incoming values
if (Phi->getParent() != L->getHeader() ||
Phi->getNumIncomingValues() != 2)
return false;
// The phi must be an induction variable
Instruction *Op;
int IncrementingBlock = -1;
for (int i = 0; i < 2; i++)
if ((Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) != nullptr)
if (Op->getOpcode() == Instruction::Add &&
(Op->getOperand(0) == Phi || Op->getOperand(1) == Phi))
IncrementingBlock = i;
if (IncrementingBlock == -1)
return false;
Instruction *IncInstruction =
cast<Instruction>(Phi->getIncomingValue(IncrementingBlock));
// If the phi is not used by anything else, we can just adapt it when
// replacing the instruction; if it is, we'll have to duplicate it
PHINode *NewPhi;
Value *IncrementPerRound = IncInstruction->getOperand(
(IncInstruction->getOperand(0) == Phi) ? 1 : 0);
// Get the value that is added to/multiplied with the phi
Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp);
if (IncrementPerRound->getType() != OffsSecondOperand->getType())
// Something has gone wrong, abort
return false;
// Only proceed if the increment per round is a constant or an instruction
// which does not originate from within the loop
if (!isa<Constant>(IncrementPerRound) &&
!(isa<Instruction>(IncrementPerRound) &&
!L->contains(cast<Instruction>(IncrementPerRound))))
return false;
if (Phi->getNumUses() == 2) {
// No other users -> reuse existing phi (One user is the instruction
// we're looking at, the other is the phi increment)
if (IncInstruction->getNumUses() != 1) {
// If the incrementing instruction does have more users than
// our phi, we need to copy it
IncInstruction = BinaryOperator::Create(
Instruction::BinaryOps(IncInstruction->getOpcode()), Phi,
IncrementPerRound, "LoopIncrement", IncInstruction);
Phi->setIncomingValue(IncrementingBlock, IncInstruction);
}
NewPhi = Phi;
} else {
// There are other users -> create a new phi
NewPhi = PHINode::Create(Phi->getType(), 0, "NewPhi", Phi);
std::vector<Value *> Increases;
// Copy the incoming values of the old phi
NewPhi->addIncoming(Phi->getIncomingValue(IncrementingBlock == 1 ? 0 : 1),
Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1));
IncInstruction = BinaryOperator::Create(
Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi,
IncrementPerRound, "LoopIncrement", IncInstruction);
NewPhi->addIncoming(IncInstruction,
Phi->getIncomingBlock(IncrementingBlock));
IncrementingBlock = 1;
}
IRBuilder<> Builder(BB->getContext());
Builder.SetInsertPoint(Phi);
Builder.SetCurrentDebugLocation(Offs->getDebugLoc());
switch (Offs->getOpcode()) {
case Instruction::Add:
pushOutAdd(NewPhi, OffsSecondOperand, IncrementingBlock == 1 ? 0 : 1);
break;
case Instruction::Mul:
pushOutMul(NewPhi, IncrementPerRound, OffsSecondOperand, IncrementingBlock,
Builder);
break;
default:
return false;
}
LLVM_DEBUG(
dbgs() << "masked gathers/scatters: simplified loop variable add/mul\n");
// The instruction has now been "absorbed" into the phi value
Offs->replaceAllUsesWith(NewPhi);
if (Offs->hasNUses(0))
Offs->eraseFromParent();
// Clean up the old increment in case it's unused because we built a new
// one
if (IncInstruction->hasNUses(0))
IncInstruction->eraseFromParent();
return true;
}
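When the offset is a mul feeding an add of the induction phi, as in push_out_mul_sub_block below, optimiseOffsets first recurses into the mul (replacing it with a new phi via pushOutMul) and then pushes out the add, leaving the loop to increment by the hoisted Product only. A sketch of the result, matching the CHECK lines of the new test:

vector.ph:
  %PushedOutMul = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
  %Product = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
  %PushedOutAdd = add <4 x i32> %PushedOutMul, <i32 6, i32 6, i32 6, i32 6>
vector.body:
  %vec.ind = phi <4 x i32> [ %PushedOutAdd, %vector.ph ], [ %IncrementPushedOutMul, %vector.body.end ]
  ; ... the gather uses %vec.ind directly ...
vector.body.end:
  %IncrementPushedOutMul = add <4 x i32> %vec.ind, %Product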
bool MVEGatherScatterLowering::runOnFunction(Function &F) {
if (!EnableMaskedGatherScatters)
return false;
@@ -455,6 +725,8 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
return false;
SmallVector<IntrinsicInst *, 4> Gathers;
SmallVector<IntrinsicInst *, 4> Scatters;
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
@@ -466,10 +738,30 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
}
bool Changed = false;
for (IntrinsicInst *I : Gathers)
Changed |= lowerGather(I);
for (IntrinsicInst *I : Scatters)
Changed |= lowerScatter(I);
for (unsigned i = 0; i < Gathers.size(); i++) {
IntrinsicInst *I = Gathers[i];
if (isa<GetElementPtrInst>(I->getArgOperand(0)))
optimiseOffsets(cast<Instruction>(I->getArgOperand(0))->getOperand(1),
I->getParent(), &LI);
Value *L = lowerGather(I);
if (L == nullptr)
continue;
// Get rid of any now dead instructions
SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent());
Changed = true;
}
for (unsigned i = 0; i < Scatters.size(); i++) {
IntrinsicInst *I = Scatters[i];
if (isa<GetElementPtrInst>(I->getArgOperand(1)))
optimiseOffsets(cast<Instruction>(I->getArgOperand(1))->getOperand(1),
I->getParent(), &LI);
Value *S = lowerScatter(I);
if (S == nullptr)
continue;
// Get rid of any now dead instructions
SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent());
Changed = true;
}
return Changed;
}


@@ -7,11 +7,11 @@
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: MVE gather/scatter lowering
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: MVE gather/scatter lowering
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Loop Pass Manager


@@ -0,0 +1,190 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -S -o 2>/dev/null - | FileCheck %s
define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_add_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
vector.ph:
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
%0 = icmp eq i32 %index, 50
br i1 %0, label %lower.block, label %end
lower.block: ; preds = %vector.body
%1 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
br label %vector.body.end
vector.body.end: ; preds = %lower.block
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
; CHECK-NEXT: [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
vector.ph:
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
%0 = icmp eq i32 %index, 50
br i1 %0, label %lower.block, label %end
lower.block: ; preds = %vector.body
%1 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%2 = add <4 x i32> %1, <i32 6, i32 6, i32 6, i32 6>
%3 = getelementptr inbounds i32, i32* %data, <4 x i32> %2
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%4 = getelementptr inbounds i32, i32* %dst, i32 %index
%5 = bitcast i32* %4 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %5, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
br label %vector.body.end
vector.body.end: ; preds = %lower.block
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_loop(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: br label [[VECTOR_2_PH:%.*]]
; CHECK: vector.2.ph:
; CHECK-NEXT: br label [[VECTOR_2_BODY:%.*]]
; CHECK: vector.2.body:
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[TMP1]], i32 32, i32 2, i32 1)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT: br label [[VECTOR_2_BODY_END:%.*]]
; CHECK: vector.2.body.end:
; CHECK-NEXT: [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
vector.ph:
%ind.end = shl i32 %n.vec, 2
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
br label %vector.2.ph
vector.2.ph:
br label %vector.2.body
vector.2.body: ; preds = %vector.body
%index.2 = phi i32 [ 0, %vector.2.ph ], [ %index.2.next, %vector.2.body.end ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
br label %vector.2.body.end
vector.2.body.end: ; preds = %lower.block
%index.2.next = add i32 %index, 4
%5 = icmp eq i32 %index.2.next, 15
br i1 %5, label %vector.body.end, label %vector.2.body
vector.body.end: ; preds = %lower.block
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %end, label %vector.body
end:
ret void;
}
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)


@@ -0,0 +1,847 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"min_enum_size", i32 4}
!2 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 26f04d01a39a33d73fd23165c208b215bf5c350d)"}
!3 = !{!4, !4, i64 0}
!4 = !{!"int", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.isvectorized", i32 1}
!9 = distinct !{!9, !10, !8}
!10 = !{!"llvm.loop.unroll.runtime.disable"}
define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_mul_gather:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI0_0
; CHECK-NEXT: vmov.i32 q0, #0x18
; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 18 @ 0x12
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%2 = getelementptr inbounds i32, i32* %dst, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_add_gather:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI1_0
; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 12 @ 0xc
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%2 = getelementptr inbounds i32, i32* %dst, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_mul_add_gather:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI2_0
; CHECK-NEXT: vmov.i32 q0, #0x18
; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 18 @ 0x12
; CHECK-NEXT: .long 24 @ 0x18
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_mul_scatter:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r1, .LCPI3_0
; CHECK-NEXT: vmov.i32 q1, #0x18
; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrw.32 q0, [r0, q2, uxtw #2]
; CHECK-NEXT: vadd.i32 q2, q2, q1
; CHECK-NEXT: bne .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI3_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 18 @ 0x12
i32* noalias nocapture %dst, i32 %n.vec,
<4 x i32> %to.store) {
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %to.store, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%2 = icmp eq i32 %index.next, %n.vec
br i1 %2, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_add_scatter:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r1, .LCPI4_0
; CHECK-NEXT: vmov.i32 q2, #0x8
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: bne .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI4_0:
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 12 @ 0xc
i32* noalias nocapture %dst, i32 %n.vec,
<4 x i32> %to.store) {
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %to.store, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%2 = icmp eq i32 %index.next, %n.vec
br i1 %2, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_mul_gather_scatter:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r1, .LCPI5_0
; CHECK-NEXT: vmov.i32 q0, #0x18
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: vadd.i32 q3, q1, q0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: bne .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 18 @ 0x12
i32* noalias nocapture %dst, i32 %n.vec) {
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.gather, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%2 = icmp eq i32 %index.next, %n.vec
br i1 %2, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_add_sub_block:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI6_0
; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI6_0:
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 12 @ 0xc
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
br label %lower.block;
lower.block: ; preds = %vector.body
%0 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
%1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%2 = getelementptr inbounds i32, i32* %dst, i32 %index
%3 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %3, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
br label %vector.body.end
vector.body.end: ; preds = %lower.block
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: non_gatscat_use1:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: adr r3, .LCPI7_0
; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vmov.i32 q2, #0x6
; CHECK-NEXT: vmov.i32 q3, #0x3
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmul.i32 q4, q0, q3
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q4, q4, q2
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vldrw.u32 q5, [r0, q4, uxtw #2]
; CHECK-NEXT: vstrb.8 q5, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI7_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%non_gatscat_use = mul <4 x i32> %0, <i32 3, i32 3, i32 3, i32 3>
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %end, label %vector.body
end:
ret void;
}
define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: non_gatscat_use2:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vmov.i32 q2, #0x6
; CHECK-NEXT: vmov.i32 q3, #0x3
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmul.i32 q4, q0, q3
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q4, q4, q2
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vldrw.u32 q5, [r0, q4, uxtw #2]
; CHECK-NEXT: vstrb.8 q5, [r1], #16
; CHECK-NEXT: bne .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI8_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %dst, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%non_gatscat_use = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
br i1 %5, label %end, label %vector.body
end:
ret void;
}
define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: arm_mat_mult_q31:
; CHECK: @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #48
; CHECK-NEXT: sub sp, #48
; CHECK-NEXT: adr r6, .LCPI9_0
; CHECK-NEXT: ldrd r9, r12, [sp, #144]
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: sub.w r6, r12, #1
; CHECK-NEXT: movs r7, #1
; CHECK-NEXT: vdup.32 q2, r9
; CHECK-NEXT: add.w r6, r7, r6, lsr #1
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: bic r6, r6, #3
; CHECK-NEXT: vmul.i32 q0, q0, r9
; CHECK-NEXT: subs r6, #4
; CHECK-NEXT: vshl.i32 q2, q2, #3
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: vmov.i32 q3, #0x8
; CHECK-NEXT: add.w r4, r7, r6, lsr #2
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: mul r10, r8, r9
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: mul r7, r8, r12
; CHECK-NEXT: vadd.i32 q0, q0, r7
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov.i32 q5, #0x0
; CHECK-NEXT: vadd.i32 q6, q0, r7
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vadd.i32 q1, q7, q3
; CHECK-NEXT: vldrw.u32 q4, [r0, q7, uxtw #2]
; CHECK-NEXT: vldrw.u32 q7, [r1, q6, uxtw #2]
; CHECK-NEXT: vadd.i32 q0, q6, q2
; CHECK-NEXT: vmov q6, q0
; CHECK-NEXT: vmul.i32 q4, q7, q4
; CHECK-NEXT: vmov q7, q1
; CHECK-NEXT: vadd.i32 q5, q4, q5
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
; CHECK-NEXT: add.w r5, r7, r10
; CHECK-NEXT: adds r7, #1
; CHECK-NEXT: vaddv.u32 r6, q5
; CHECK-NEXT: cmp r7, r9
; CHECK-NEXT: str.w r6, [r2, r5, lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: bne .LBB9_1
; CHECK-NEXT: @ %bb.6: @ %for.end25
; CHECK-NEXT: add sp, #48
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI9_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry
%0 = add i32 %l, -1
%1 = lshr i32 %0, 1
%2 = add nuw i32 %1, 1
%min.iters.check = icmp ult i32 %0, 6
%n.vec = and i32 %2, -4
%ind.end = shl i32 %n.vec, 1
%broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0
%broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp.n = icmp eq i32 %2, %n.vec
br label %for.cond8.preheader.us.us.preheader
for.cond8.preheader.us.us.preheader: ; preds = %for.cond8.preheader.us.us.preheader.preheader, %for.cond4.for.cond.cleanup6_crit_edge.us
%i.054.us = phi i32 [ %inc24.us, %for.cond4.for.cond.cleanup6_crit_edge.us ], [ 0, %for.cond8.preheader.us.us.preheader.preheader ]
%mul.us = mul i32 %i.054.us, %l
%mul18.us = mul i32 %i.054.us, %m
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %mul.us, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.ph
for.cond4.for.cond.cleanup6_crit_edge.us: ; preds = %for.cond8.for.cond.cleanup10_crit_edge.us.us
%inc24.us = add nuw nsw i32 %i.054.us, 1
%exitcond85 = icmp eq i32 %inc24.us, %n
br i1 %exitcond85, label %for.end25, label %for.cond8.preheader.us.us.preheader
vector.ph: ; preds = %middle.block, %for.cond8.preheader.us.us.preheader
%j.051.us.us = phi i32 [ %inc.us.us, %middle.block ], [ 0, %for.cond8.preheader.us.us.preheader ]
%broadcast.splatinsert88 = insertelement <4 x i32> undef, i32 %j.051.us.us, i32 0
%broadcast.splat89 = shufflevector <4 x i32> %broadcast.splatinsert88, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
%3 = add <4 x i32> %vec.ind, %broadcast.splat
%4 = getelementptr inbounds i32, i32* %A, <4 x i32> %3
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %4, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa !3
%5 = mul <4 x i32> %vec.ind, %broadcast.splat87
%6 = add <4 x i32> %5, %broadcast.splat89
%7 = getelementptr inbounds i32, i32* %B, <4 x i32> %6
%wide.masked.gather90 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %7, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa !3
%8 = mul nsw <4 x i32> %wide.masked.gather90, %wide.masked.gather
%9 = add <4 x i32> %8, %vec.phi
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%10 = icmp eq i32 %index.next, %n.vec
br i1 %10, label %middle.block, label %vector.body, !llvm.loop !7
middle.block: ; preds = %vector.body
%11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9)
;for.cond8.for.cond.cleanup10_crit_edge.us.us: ; preds = %for.body11.us.us, %middle.block
%add19.us.us = add i32 %j.051.us.us, %mul18.us
%arrayidx20.us.us = getelementptr inbounds i32, i32* %C, i32 %add19.us.us
store i32 %11, i32* %arrayidx20.us.us, align 4, !tbaa !3
%inc.us.us = add nuw nsw i32 %j.051.us.us, 1
%exitcond = icmp eq i32 %inc.us.us, %m
br i1 %exitcond, label %for.cond4.for.cond.cleanup6_crit_edge.us, label %vector.ph
for.end25: ; preds = %for.cond4.for.cond.cleanup6_crit_edge.us, %entry
ret void
}
define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i16* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: arm_mat_mult_q15:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: itt ne
; CHECK-NEXT: ldrne.w lr, [sp, #104]
; CHECK-NEXT: cmpne.w lr, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
; CHECK-NEXT: ldr.w r11, [sp, #108]
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: movs r1, #1
; CHECK-NEXT: lsl.w r4, lr, #1
; CHECK-NEXT: bic r0, r11, #3
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: add.w r8, r1, r0, lsr #2
; CHECK-NEXT: lsl.w r0, r11, #1
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: adr r0, .LCPI10_0
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: vmul.i32 q4, q0, lr
; CHECK-NEXT: vdup.32 q0, lr
; CHECK-NEXT: vshl.i32 q5, q0, #2
; CHECK-NEXT: b .LBB10_5
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: mov r1, r4
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: bl __aeabi_memclr
; CHECK-NEXT: ldr.w lr, [sp, #104]
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r9, r11
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add r1, r0
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: adds r1, #1
; CHECK-NEXT: cmp r1, r0
; CHECK-NEXT: beq .LBB10_1
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB10_8 Depth 2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: mul r12, r1, lr
; CHECK-NEXT: cmp.w r11, #0
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: b .LBB10_8
; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r3, r10, r12
; CHECK-NEXT: add.w r10, r10, #1
; CHECK-NEXT: cmp r10, lr
; CHECK-NEXT: strh.w r2, [r0, r3, lsl #1]
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: cmp.w r11, #3
; CHECK-NEXT: bhi .LBB10_10
; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB10_13
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vadd.i32 q1, q4, r10
; CHECK-NEXT: dls lr, r8
; CHECK-NEXT: .LBB10_11: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vadd.i32 q2, q1, q5
; CHECK-NEXT: vldrh.s32 q3, [r6, q1, uxtw #1]
; CHECK-NEXT: vldrh.s32 q1, [r2], #8
; CHECK-NEXT: vmul.i32 q1, q3, q1
; CHECK-NEXT: vadd.i32 q0, q1, q0
; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vaddv.u32 r2, q0
; CHECK-NEXT: ldr.w lr, [sp, #104]
; CHECK-NEXT: cmp r7, r11
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: mla r3, lr, r7, r10
; CHECK-NEXT: sub.w r5, r11, r7
; CHECK-NEXT: add r7, r9
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r7, r0, r7, lsl #1
; CHECK-NEXT: add.w r3, r6, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: ldrsh.w r1, [r3]
; CHECK-NEXT: add r3, r4
; CHECK-NEXT: ldrsh r0, [r7], #2
; CHECK-NEXT: subs r5, #1
; CHECK-NEXT: smlabb r2, r1, r0, r2
; CHECK-NEXT: bne .LBB10_14
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.15:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp48 = icmp eq i32 %n, 0
br i1 %cmp48, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
for.cond1.preheader.lr.ph: ; preds = %entry
%cmp245 = icmp eq i32 %m, 0
%cmp642 = icmp eq i32 %l, 0
br i1 %cmp245, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
for.cond1.preheader.us.preheader: ; preds = %for.cond1.preheader.lr.ph
%0 = shl nuw i32 %m, 1
%min.iters.check = icmp ult i32 %l, 4
%n.vec = and i32 %l, -4
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %m, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp.n = icmp eq i32 %n.vec, %l
br label %for.cond1.preheader.us
for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
%i.049.us = phi i32 [ %inc23.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
%1 = mul i32 %i.049.us, %m
%mul.us = mul i32 %i.049.us, %l
br i1 %cmp642, label %for.cond5.preheader.us73.preheader, label %for.cond5.preheader.us.us
for.cond5.preheader.us73.preheader: ; preds = %for.cond1.preheader.us
%scevgep = getelementptr i16, i16* %C, i32 %1
%scevgep82 = bitcast i16* %scevgep to i8*
call void @llvm.memset.p0i8.i32(i8* align 2 %scevgep82, i8 0, i32 %0, i1 false)
br label %for.cond1.for.cond.cleanup3_crit_edge.us
for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us.us, %for.cond5.preheader.us73.preheader
%inc23.us = add nuw nsw i32 %i.049.us, 1
%exitcond84 = icmp eq i32 %inc23.us, %n
br i1 %exitcond84, label %for.cond.cleanup, label %for.cond1.preheader.us
for.cond5.preheader.us.us: ; preds = %for.cond1.preheader.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us
%j.046.us.us = phi i32 [ %inc20.us.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us ], [ 0, %for.cond1.preheader.us ]
br i1 %min.iters.check, label %for.body8.us.us.preheader, label %vector.ph
for.body8.us.us.preheader: ; preds = %middle.block, %for.cond5.preheader.us.us
%k.044.us.us.ph = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %n.vec, %middle.block ]
%sum.043.us.us.ph = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %13, %middle.block ]
br label %for.body8.us.us
vector.ph: ; preds = %for.cond5.preheader.us.us
%broadcast.splatinsert85 = insertelement <4 x i32> undef, i32 %j.046.us.us, i32 0
%broadcast.splat86 = shufflevector <4 x i32> %broadcast.splatinsert85, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
%2 = add i32 %index, %mul.us
%3 = getelementptr inbounds i16, i16* %A, i32 %2
%4 = bitcast i16* %3 to <4 x i16>*
%wide.load = load <4 x i16>, <4 x i16>* %4, align 2, !tbaa !3
%5 = sext <4 x i16> %wide.load to <4 x i32>
%6 = mul <4 x i32> %vec.ind, %broadcast.splat
%7 = add <4 x i32> %6, %broadcast.splat86
%8 = getelementptr inbounds i16, i16* %B, <4 x i32> %7
%wide.masked.gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %8, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef), !tbaa !3
%9 = sext <4 x i16> %wide.masked.gather to <4 x i32>
%10 = mul nsw <4 x i32> %9, %5
%11 = add <4 x i32> %10, %vec.phi
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
%12 = icmp eq i32 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body, !llvm.loop !7
middle.block: ; preds = %vector.body
%13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %11)
br i1 %cmp.n, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us.preheader
for.cond5.for.cond.cleanup7_crit_edge.us.us: ; preds = %for.body8.us.us, %middle.block
%add14.us.us.lcssa = phi i32 [ %13, %middle.block ], [ %add14.us.us, %for.body8.us.us ]
%conv15.us.us = trunc i32 %add14.us.us.lcssa to i16
%add17.us.us = add i32 %j.046.us.us, %1
%arrayidx18.us.us = getelementptr inbounds i16, i16* %C, i32 %add17.us.us
store i16 %conv15.us.us, i16* %arrayidx18.us.us, align 2, !tbaa !3
%inc20.us.us = add nuw nsw i32 %j.046.us.us, 1
%exitcond83 = icmp eq i32 %inc20.us.us, %m
br i1 %exitcond83, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.cond5.preheader.us.us
for.body8.us.us: ; preds = %for.body8.us.us.preheader, %for.body8.us.us
%k.044.us.us = phi i32 [ %inc.us.us, %for.body8.us.us ], [ %k.044.us.us.ph, %for.body8.us.us.preheader ]
%sum.043.us.us = phi i32 [ %add14.us.us, %for.body8.us.us ], [ %sum.043.us.us.ph, %for.body8.us.us.preheader ]
%add.us.us = add i32 %k.044.us.us, %mul.us
%arrayidx.us.us = getelementptr inbounds i16, i16* %A, i32 %add.us.us
%14 = load i16, i16* %arrayidx.us.us, align 2, !tbaa !3
%conv.us.us = sext i16 %14 to i32
%mul9.us.us = mul i32 %k.044.us.us, %m
%add10.us.us = add i32 %mul9.us.us, %j.046.us.us
%arrayidx11.us.us = getelementptr inbounds i16, i16* %B, i32 %add10.us.us
%15 = load i16, i16* %arrayidx11.us.us, align 2, !tbaa !3
%conv12.us.us = sext i16 %15 to i32
%mul13.us.us = mul nsw i32 %conv12.us.us, %conv.us.us
%add14.us.us = add nsw i32 %mul13.us.us, %sum.043.us.us
%inc.us.us = add nuw nsw i32 %k.044.us.us, 1
%exitcond = icmp eq i32 %inc.us.us, %l
br i1 %exitcond, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us, !llvm.loop !9
for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
ret void
}
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare void @llvm.memset.p0i8.i32(i8* align 2, i8, i32, i1)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)