diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT index 3339c039ff8..619844256ad 100644 --- a/CODE_OWNERS.TXT +++ b/CODE_OWNERS.TXT @@ -70,7 +70,7 @@ D: Branch weights and BlockFrequencyInfo N: Hal Finkel E: hfinkel@anl.gov -D: BBVectorize, the loop reroller, alias analysis and the PowerPC target +D: The loop reroller, alias analysis and the PowerPC target N: Dan Gohman E: sunfish@mozilla.com diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 5939805a981..ddb31acfd02 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -54,8 +54,9 @@ Non-comprehensive list of changes in this release its nature as a general purpose PDB manipulation / diagnostics tool that does more than just dumping contents. - -* ... next change ... +* The ``BBVectorize`` pass has been removed. It was fully replaced and no + longer used back in 2014 but we didn't get around to removing it. Now it is + gone. The SLP vectorizer is the suggested non-loop vectorization pass. .. NOTE If you would like to document a larger change, then you can add a @@ -111,7 +112,11 @@ Changes to the OCaml bindings Changes to the C API -------------------- - During this release ... +* Deprecated the ``LLVMAddBBVectorizePass`` interface since the ``BBVectorize`` + pass has been removed. It is now a no-op and will be removed in the next + release. Use ``LLVMAddSLPVectorizePass`` instead to get the supported SLP + vectorizer. + External Open Source Projects Using LLVM 5 ========================================== diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h index a82ef49cb16..cf8306aee76 100644 --- a/include/llvm-c/Transforms/Vectorize.h +++ b/include/llvm-c/Transforms/Vectorize.h @@ -33,7 +33,7 @@ extern "C" { * @{ */ -/** See llvm::createBBVectorizePass function. */ +/** DEPRECATED - Use LLVMAddSLPVectorizePass */ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM); /** See llvm::createLoopVectorizePass function. */ diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index a52fa3b542a..aab14070dbd 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -70,7 +70,6 @@ void initializeAlwaysInlinerLegacyPassPass(PassRegistry&); void initializeArgPromotionPass(PassRegistry&); void initializeAssumptionCacheTrackerPass(PassRegistry&); void initializeAtomicExpandPass(PassRegistry&); -void initializeBBVectorizePass(PassRegistry&); void initializeBDCELegacyPassPass(PassRegistry&); void initializeBarrierNoopPass(PassRegistry&); void initializeBasicAAWrapperPassPass(PassRegistry&); diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index c309ddbe2f0..d07c15c1013 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -195,7 +195,6 @@ namespace { (void) llvm::createLoopVectorizePass(); (void) llvm::createSLPVectorizerPass(); (void) llvm::createLoadStoreVectorizerPass(); - (void) llvm::createBBVectorizePass(); (void) llvm::createPartiallyInlineLibCallsPass(); (void) llvm::createScalarizerPass(); (void) llvm::createSeparateConstOffsetFromGEPPass(); diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index db4bfb15f51..276306f686f 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -145,7 +145,6 @@ public: bool DisableTailCalls; bool DisableUnitAtATime; bool DisableUnrollLoops; - bool BBVectorize; bool SLPVectorize; bool LoopVectorize; bool RerollLoops; diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h index f734e299c6e..19845e471e4 100644 --- a/include/llvm/Transforms/Vectorize.h +++ b/include/llvm/Transforms/Vectorize.h @@ -106,13 +106,6 @@ struct VectorizeConfig { VectorizeConfig(); }; -//===----------------------------------------------------------------------===// -// -// BBVectorize - A basic-block vectorization pass. -// -BasicBlockPass * -createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig()); - //===----------------------------------------------------------------------===// // // LoopVectorize - Create a loop vectorization pass. diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 5538756b8bf..5b1b58b89c3 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -55,10 +55,6 @@ static cl::opt RunSLPVectorization("vectorize-slp", cl::Hidden, cl::desc("Run the SLP vectorization passes")); -static cl::opt -RunBBVectorization("vectorize-slp-aggressive", cl::Hidden, - cl::desc("Run the BB vectorization passes")); - static cl::opt UseGVNAfterVectorization("use-gvn-after-vectorization", cl::init(false), cl::Hidden, @@ -166,7 +162,6 @@ PassManagerBuilder::PassManagerBuilder() { Inliner = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; - BBVectorize = RunBBVectorization; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; RerollLoops = RunLoopRerolling; @@ -384,26 +379,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (RerollLoops) MPM.add(createLoopRerollPass()); - if (!RunSLPAfterLoopVectorization) { - if (SLPVectorize) - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass(OptLevel)); - } - } + if (!RunSLPAfterLoopVectorization && SLPVectorize) + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs @@ -635,28 +612,10 @@ void PassManagerBuilder::populateModulePassManager( addInstructionCombiningPass(MPM); } - if (RunSLPAfterLoopVectorization) { - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - if (OptLevel > 1 && ExtraVectorizerPasses) { - MPM.add(createEarlyCSEPass()); - } - } - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass(OptLevel)); + if (RunSLPAfterLoopVectorization && SLPVectorize) { + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + if (OptLevel > 1 && ExtraVectorizerPasses) { + MPM.add(createEarlyCSEPass()); } } diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp deleted file mode 100644 index 78453aaa16c..00000000000 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ /dev/null @@ -1,3282 +0,0 @@ -//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a basic-block vectorization pass. The algorithm was -// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, -// et al. It works by looking for chains of pairable operations and then -// pairing them. -// -//===----------------------------------------------------------------------===// - -#define BBV_NAME "bb-vectorize" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/ValueHandle.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" -#include -using namespace llvm; - -#define DEBUG_TYPE BBV_NAME - -static cl::opt -IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), - cl::Hidden, cl::desc("Ignore target information")); - -static cl::opt -ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, - cl::desc("The required chain depth for vectorization")); - -static cl::opt -UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), - cl::Hidden, cl::desc("Use the chain depth requirement with" - " target information")); - -static cl::opt -SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, - cl::desc("The maximum search distance for instruction pairs")); - -static cl::opt -SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, - cl::desc("Replicating one element to a pair breaks the chain")); - -static cl::opt -VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, - cl::desc("The size of the native vector registers")); - -static cl::opt -MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, - cl::desc("The maximum number of pairing iterations")); - -static cl::opt -Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, - cl::desc("Don't try to form non-2^n-length vectors")); - -static cl::opt -MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, - cl::desc("The maximum number of pairable instructions per group")); - -static cl::opt -MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, - cl::desc("The maximum number of candidate instruction pairs per group")); - -static cl::opt -MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), - cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" - " a full cycle check")); - -static cl::opt -NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize boolean (i1) values")); - -static cl::opt -NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize integer values")); - -static cl::opt -NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize floating-point values")); - -// FIXME: This should default to false once pointer vector support works. -static cl::opt -NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, - cl::desc("Don't try to vectorize pointer values")); - -static cl::opt -NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize casting (conversion) operations")); - -static cl::opt -NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize floating-point math intrinsics")); - -static cl::opt - NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize BitManipulation intrinsics")); - -static cl::opt -NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); - -static cl::opt -NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize select instructions")); - -static cl::opt -NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize comparison instructions")); - -static cl::opt -NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize getelementptr instructions")); - -static cl::opt -NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize loads and stores")); - -static cl::opt -AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, - cl::desc("Only generate aligned loads and stores")); - -static cl::opt -NoMemOpBoost("bb-vectorize-no-mem-op-boost", - cl::init(false), cl::Hidden, - cl::desc("Don't boost the chain-depth contribution of loads and stores")); - -static cl::opt -FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, - cl::desc("Use a fast instruction dependency analysis")); - -#ifndef NDEBUG -static cl::opt -DebugInstructionExamination("bb-vectorize-debug-instruction-examination", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " instruction-examination process")); -static cl::opt -DebugCandidateSelection("bb-vectorize-debug-candidate-selection", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " candidate-selection process")); -static cl::opt -DebugPairSelection("bb-vectorize-debug-pair-selection", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " pair-selection process")); -static cl::opt -DebugCycleCheck("bb-vectorize-debug-cycle-check", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " cycle-checking process")); - -static cl::opt -PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, dump the basic block after" - " every pair is fused")); -#endif - -STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); - -namespace { - struct BBVectorize : public BasicBlockPass { - static char ID; // Pass identification, replacement for typeid - - const VectorizeConfig Config; - - BBVectorize(const VectorizeConfig &C = VectorizeConfig()) - : BasicBlockPass(ID), Config(C) { - initializeBBVectorizePass(*PassRegistry::getPassRegistry()); - } - - BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) - : BasicBlockPass(ID), Config(C) { - AA = &P->getAnalysis().getAAResults(); - DT = &P->getAnalysis().getDomTree(); - SE = &P->getAnalysis().getSE(); - TLI = &P->getAnalysis().getTLI(); - TTI = IgnoreTargetInfo - ? nullptr - : &P->getAnalysis().getTTI(F); - } - - typedef std::pair ValuePair; - typedef std::pair ValuePairWithCost; - typedef std::pair ValuePairWithDepth; - typedef std::pair VPPair; // A ValuePair pair - typedef std::pair VPPairWithType; - - AliasAnalysis *AA; - DominatorTree *DT; - ScalarEvolution *SE; - const TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - - // FIXME: const correct? - - bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); - - bool getCandidatePairs(BasicBlock &BB, - BasicBlock::iterator &Start, - DenseMap > &CandidatePairs, - DenseSet &FixedOrderPairs, - DenseMap &CandidatePairCostSavings, - std::vector &PairableInsts, bool NonPow2Len); - - // FIXME: The current implementation does not account for pairs that - // are connected in multiple ways. For example: - // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) - enum PairConnectionType { - PairConnectionDirect, - PairConnectionSwap, - PairConnectionSplat - }; - - void computeConnectedPairs( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseMap &PairConnectionTypes); - - void buildDepMap(BasicBlock &BB, - DenseMap > &CandidatePairs, - std::vector &PairableInsts, - DenseSet &PairableInstUsers); - - void choosePairs(DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - DenseMap &CandidatePairCostSavings, - std::vector &PairableInsts, - DenseSet &FixedOrderPairs, - DenseMap &PairConnectionTypes, - DenseMap > &ConnectedPairs, - DenseMap > &ConnectedPairDeps, - DenseSet &PairableInstUsers, - DenseMap& ChosenPairs); - - void fuseChosenPairs(BasicBlock &BB, - std::vector &PairableInsts, - DenseMap& ChosenPairs, - DenseSet &FixedOrderPairs, - DenseMap &PairConnectionTypes, - DenseMap > &ConnectedPairs, - DenseMap > &ConnectedPairDeps); - - - bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); - - bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len, - int &CostSavings, int &FixedOrder); - - bool trackUsesOfI(DenseSet &Users, - AliasSetTracker &WriteSet, Instruction *I, - Instruction *J, bool UpdateUsers = true, - DenseSet *LoadMoveSetPairs = nullptr); - - void computePairsConnectedTo( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseMap &PairConnectionTypes, - ValuePair P); - - bool pairsConflict(ValuePair P, ValuePair Q, - DenseSet &PairableInstUsers, - DenseMap > - *PairableInstUserMap = nullptr, - DenseSet *PairableInstUserPairSet = nullptr); - - bool pairWillFormCycle(ValuePair P, - DenseMap > &PairableInstUsers, - DenseSet &CurrentPairs); - - void pruneDAGFor( - DenseMap > &CandidatePairs, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseSet &PairableInstUsers, - DenseMap > &PairableInstUserMap, - DenseSet &PairableInstUserPairSet, - DenseMap &ChosenPairs, - DenseMap &DAG, - DenseSet &PrunedDAG, ValuePair J, - bool UseCycleCheck); - - void buildInitialDAGFor( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseSet &PairableInstUsers, - DenseMap &ChosenPairs, - DenseMap &DAG, ValuePair J); - - void findBestDAGFor( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - DenseMap &CandidatePairCostSavings, - std::vector &PairableInsts, - DenseSet &FixedOrderPairs, - DenseMap &PairConnectionTypes, - DenseMap > &ConnectedPairs, - DenseMap > &ConnectedPairDeps, - DenseSet &PairableInstUsers, - DenseMap > &PairableInstUserMap, - DenseSet &PairableInstUserPairSet, - DenseMap &ChosenPairs, - DenseSet &BestDAG, size_t &BestMaxDepth, - int &BestEffSize, Value *II, std::vector&JJ, - bool UseCycleCheck); - - Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o); - - void fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned MaskOffset, unsigned NumInElem, - unsigned NumInElem1, unsigned IdxOffset, - std::vector &Mask); - - Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, - Instruction *J); - - bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, - unsigned o, Value *&LOp, unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, - unsigned IdxOff = 0); - - Value *getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool IBeforeJ); - - void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, - Instruction *J, SmallVectorImpl &ReplacedOperands, - bool IBeforeJ); - - void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, - Instruction *J, Instruction *K, - Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2); - - void collectPairLoadMoveSet(BasicBlock &BB, - DenseMap &ChosenPairs, - DenseMap > &LoadMoveSet, - DenseSet &LoadMoveSetPairs, - Instruction *I); - - void collectLoadMoveSet(BasicBlock &BB, - std::vector &PairableInsts, - DenseMap &ChosenPairs, - DenseMap > &LoadMoveSet, - DenseSet &LoadMoveSetPairs); - - bool canMoveUsesOfIAfterJ(BasicBlock &BB, - DenseSet &LoadMoveSetPairs, - Instruction *I, Instruction *J); - - void moveUsesOfIAfterJ(BasicBlock &BB, - DenseSet &LoadMoveSetPairs, - Instruction *&InsertionPt, - Instruction *I, Instruction *J); - - bool vectorizeBB(BasicBlock &BB) { - if (skipBasicBlock(BB)) - return false; - if (!DT->isReachableFromEntry(&BB)) { - DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << - " in " << BB.getParent()->getName() << "\n"); - return false; - } - - DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); - - bool changed = false; - // Iterate a sufficient number of times to merge types of size 1 bit, - // then 2 bits, then 4, etc. up to half of the target vector width of the - // target vector register. - unsigned n = 1; - for (unsigned v = 2; - (TTI || v <= Config.VectorBits) && - (!Config.MaxIter || n <= Config.MaxIter); - v *= 2, ++n) { - DEBUG(dbgs() << "BBV: fusing loop #" << n << - " for " << BB.getName() << " in " << - BB.getParent()->getName() << "...\n"); - if (vectorizePairs(BB)) - changed = true; - else - break; - } - - if (changed && !Pow2LenOnly) { - ++n; - for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { - DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << - n << " for " << BB.getName() << " in " << - BB.getParent()->getName() << "...\n"); - if (!vectorizePairs(BB, true)) break; - } - } - - DEBUG(dbgs() << "BBV: done!\n"); - return changed; - } - - bool runOnBasicBlock(BasicBlock &BB) override { - // OptimizeNone check deferred to vectorizeBB(). - - AA = &getAnalysis().getAAResults(); - DT = &getAnalysis().getDomTree(); - SE = &getAnalysis().getSE(); - TLI = &getAnalysis().getTLI(); - TTI = IgnoreTargetInfo - ? nullptr - : &getAnalysis().getTTI( - *BB.getParent()); - - return vectorizeBB(BB); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - BasicBlockPass::getAnalysisUsage(AU); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - AU.setPreservesCFG(); - } - - static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { - assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && - "Cannot form vector from incompatible scalar types"); - Type *STy = ElemTy->getScalarType(); - - unsigned numElem; - if (VectorType *VTy = dyn_cast(ElemTy)) { - numElem = VTy->getNumElements(); - } else { - numElem = 1; - } - - if (VectorType *VTy = dyn_cast(Elem2Ty)) { - numElem += VTy->getNumElements(); - } else { - numElem += 1; - } - - return VectorType::get(STy, numElem); - } - - static inline void getInstructionTypes(Instruction *I, - Type *&T1, Type *&T2) { - if (StoreInst *SI = dyn_cast(I)) { - // For stores, it is the value type, not the pointer type that matters - // because the value is what will come from a vector register. - - Value *IVal = SI->getValueOperand(); - T1 = IVal->getType(); - } else { - T1 = I->getType(); - } - - if (CastInst *CI = dyn_cast(I)) - T2 = CI->getSrcTy(); - else - T2 = T1; - - if (SelectInst *SI = dyn_cast(I)) { - T2 = SI->getCondition()->getType(); - } else if (ShuffleVectorInst *SI = dyn_cast(I)) { - T2 = SI->getOperand(0)->getType(); - } else if (CmpInst *CI = dyn_cast(I)) { - T2 = CI->getOperand(0)->getType(); - } - } - - // Returns the weight associated with the provided value. A chain of - // candidate pairs has a length given by the sum of the weights of its - // members (one weight per pair; the weight of each member of the pair - // is assumed to be the same). This length is then compared to the - // chain-length threshold to determine if a given chain is significant - // enough to be vectorized. The length is also used in comparing - // candidate chains where longer chains are considered to be better. - // Note: when this function returns 0, the resulting instructions are - // not actually fused. - inline size_t getDepthFactor(Value *V) { - // InsertElement and ExtractElement have a depth factor of zero. This is - // for two reasons: First, they cannot be usefully fused. Second, because - // the pass generates a lot of these, they can confuse the simple metric - // used to compare the dags in the next iteration. Thus, giving them a - // weight of zero allows the pass to essentially ignore them in - // subsequent iterations when looking for vectorization opportunities - // while still tracking dependency chains that flow through those - // instructions. - if (isa(V) || isa(V)) - return 0; - - // Give a load or store half of the required depth so that load/store - // pairs will vectorize. - if (!Config.NoMemOpBoost && (isa(V) || isa(V))) - return Config.ReqChainDepth/2; - - return 1; - } - - // Returns the cost of the provided instruction using TTI. - // This does not handle loads and stores. - unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2, - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue, - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue, - const Instruction *I = nullptr) { - switch (Opcode) { - default: break; - case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the instruction addressing mode. At the moment we don't - // generate vector GEPs. - return 0; - case Instruction::Br: - return TTI->getCFInstrCost(Opcode); - case Instruction::PHI: - return 0; - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - return TTI->getCmpSelInstrCost(Opcode, T1, T2, I); - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::ShuffleVector: - return TTI->getCastInstrCost(Opcode, T1, T2, I); - } - - return 1; - } - - // This determines the relative offset of two loads or stores, returning - // true if the offset could be determined to be some constant value. - // For example, if OffsetInElmts == 1, then J accesses the memory directly - // after I; if OffsetInElmts == -1 then I accesses the memory - // directly after J. - bool getPairPtrInfo(Instruction *I, Instruction *J, - Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, - unsigned &IAddressSpace, unsigned &JAddressSpace, - int64_t &OffsetInElmts, bool ComputeOffset = true) { - OffsetInElmts = 0; - if (LoadInst *LI = dyn_cast(I)) { - LoadInst *LJ = cast(J); - IPtr = LI->getPointerOperand(); - JPtr = LJ->getPointerOperand(); - IAlignment = LI->getAlignment(); - JAlignment = LJ->getAlignment(); - IAddressSpace = LI->getPointerAddressSpace(); - JAddressSpace = LJ->getPointerAddressSpace(); - } else { - StoreInst *SI = cast(I), *SJ = cast(J); - IPtr = SI->getPointerOperand(); - JPtr = SJ->getPointerOperand(); - IAlignment = SI->getAlignment(); - JAlignment = SJ->getAlignment(); - IAddressSpace = SI->getPointerAddressSpace(); - JAddressSpace = SJ->getPointerAddressSpace(); - } - - if (!ComputeOffset) - return true; - - const SCEV *IPtrSCEV = SE->getSCEV(IPtr); - const SCEV *JPtrSCEV = SE->getSCEV(JPtr); - - // If this is a trivial offset, then we'll get something like - // 1*sizeof(type). With target data, which we need anyway, this will get - // constant folded into a number. - const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); - if (const SCEVConstant *ConstOffSCEV = - dyn_cast(OffsetSCEV)) { - ConstantInt *IntOff = ConstOffSCEV->getValue(); - int64_t Offset = IntOff->getSExtValue(); - const DataLayout &DL = I->getModule()->getDataLayout(); - Type *VTy = IPtr->getType()->getPointerElementType(); - int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy); - - Type *VTy2 = JPtr->getType()->getPointerElementType(); - if (VTy != VTy2 && Offset < 0) { - int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2); - OffsetInElmts = Offset/VTy2TSS; - return (std::abs(Offset) % VTy2TSS) == 0; - } - - OffsetInElmts = Offset/VTyTSS; - return (std::abs(Offset) % VTyTSS) == 0; - } - - return false; - } - - // Returns true if the provided CallInst represents an intrinsic that can - // be vectorized. - bool isVectorizableIntrinsic(CallInst* I) { - Function *F = I->getCalledFunction(); - if (!F) return false; - - Intrinsic::ID IID = F->getIntrinsicID(); - if (!IID) return false; - - switch(IID) { - default: - return false; - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::log: - case Intrinsic::log2: - case Intrinsic::log10: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::pow: - case Intrinsic::round: - case Intrinsic::copysign: - case Intrinsic::ceil: - case Intrinsic::nearbyint: - case Intrinsic::rint: - case Intrinsic::trunc: - case Intrinsic::floor: - case Intrinsic::fabs: - case Intrinsic::minnum: - case Intrinsic::maxnum: - return Config.VectorizeMath; - case Intrinsic::bswap: - case Intrinsic::ctpop: - case Intrinsic::ctlz: - case Intrinsic::cttz: - return Config.VectorizeBitManipulations; - case Intrinsic::fma: - case Intrinsic::fmuladd: - return Config.VectorizeFMA; - } - } - - bool isPureIEChain(InsertElementInst *IE) { - InsertElementInst *IENext = IE; - do { - if (!isa(IENext->getOperand(0)) && - !isa(IENext->getOperand(0))) { - return false; - } - } while ((IENext = - dyn_cast(IENext->getOperand(0)))); - - return true; - } - }; - - // This function implements one vectorization iteration on the provided - // basic block. It returns true if the block is changed. - bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { - bool ShouldContinue; - BasicBlock::iterator Start = BB.getFirstInsertionPt(); - - std::vector AllPairableInsts; - DenseMap AllChosenPairs; - DenseSet AllFixedOrderPairs; - DenseMap AllPairConnectionTypes; - DenseMap > AllConnectedPairs, - AllConnectedPairDeps; - - do { - std::vector PairableInsts; - DenseMap > CandidatePairs; - DenseSet FixedOrderPairs; - DenseMap CandidatePairCostSavings; - ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, - FixedOrderPairs, - CandidatePairCostSavings, - PairableInsts, NonPow2Len); - if (PairableInsts.empty()) continue; - - // Build the candidate pair set for faster lookups. - DenseSet CandidatePairsSet; - for (DenseMap >::iterator I = - CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) - for (std::vector::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - CandidatePairsSet.insert(ValuePair(I->first, *J)); - - // Now we have a map of all of the pairable instructions and we need to - // select the best possible pairing. A good pairing is one such that the - // users of the pair are also paired. This defines a (directed) forest - // over the pairs such that two pairs are connected iff the second pair - // uses the first. - - // Note that it only matters that both members of the second pair use some - // element of the first pair (to allow for splatting). - - DenseMap > ConnectedPairs, - ConnectedPairDeps; - DenseMap PairConnectionTypes; - computeConnectedPairs(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, PairConnectionTypes); - if (ConnectedPairs.empty()) continue; - - for (DenseMap >::iterator - I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) - for (std::vector::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - ConnectedPairDeps[*J].push_back(I->first); - - // Build the pairable-instruction dependency map - DenseSet PairableInstUsers; - buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); - - // There is now a graph of the connected pairs. For each variable, pick - // the pairing with the largest dag meeting the depth requirement on at - // least one branch. Then select all pairings that are part of that dag - // and remove them from the list of available pairings and pairable - // variables. - - DenseMap ChosenPairs; - choosePairs(CandidatePairs, CandidatePairsSet, - CandidatePairCostSavings, - PairableInsts, FixedOrderPairs, PairConnectionTypes, - ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, ChosenPairs); - - if (ChosenPairs.empty()) continue; - AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), - PairableInsts.end()); - AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); - - // Only for the chosen pairs, propagate information on fixed-order pairs, - // pair connections, and their types to the data structures used by the - // pair fusion procedures. - for (DenseMap::iterator I = ChosenPairs.begin(), - IE = ChosenPairs.end(); I != IE; ++I) { - if (FixedOrderPairs.count(*I)) - AllFixedOrderPairs.insert(*I); - else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) - AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); - - for (DenseMap::iterator J = ChosenPairs.begin(); - J != IE; ++J) { - DenseMap::iterator K = - PairConnectionTypes.find(VPPair(*I, *J)); - if (K != PairConnectionTypes.end()) { - AllPairConnectionTypes.insert(*K); - } else { - K = PairConnectionTypes.find(VPPair(*J, *I)); - if (K != PairConnectionTypes.end()) - AllPairConnectionTypes.insert(*K); - } - } - } - - for (DenseMap >::iterator - I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) - for (std::vector::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { - AllConnectedPairs[I->first].push_back(*J); - AllConnectedPairDeps[*J].push_back(I->first); - } - } while (ShouldContinue); - - if (AllChosenPairs.empty()) return false; - NumFusedOps += AllChosenPairs.size(); - - // A set of pairs has now been selected. It is now necessary to replace the - // paired instructions with vector instructions. For this procedure each - // operand must be replaced with a vector operand. This vector is formed - // by using build_vector on the old operands. The replaced values are then - // replaced with a vector_extract on the result. Subsequent optimization - // passes should coalesce the build/extract combinations. - - fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, - AllPairConnectionTypes, - AllConnectedPairs, AllConnectedPairDeps); - - // It is important to cleanup here so that future iterations of this - // function have less work to do. - (void)SimplifyInstructionsInBlock(&BB, TLI); - return true; - } - - // This function returns true if the provided instruction is capable of being - // fused into a vector instruction. This determination is based only on the - // type and other attributes of the instruction. - bool BBVectorize::isInstVectorizable(Instruction *I, - bool &IsSimpleLoadStore) { - IsSimpleLoadStore = false; - - if (CallInst *C = dyn_cast(I)) { - if (!isVectorizableIntrinsic(C)) - return false; - } else if (LoadInst *L = dyn_cast(I)) { - // Vectorize simple loads if possbile: - IsSimpleLoadStore = L->isSimple(); - if (!IsSimpleLoadStore || !Config.VectorizeMemOps) - return false; - } else if (StoreInst *S = dyn_cast(I)) { - // Vectorize simple stores if possbile: - IsSimpleLoadStore = S->isSimple(); - if (!IsSimpleLoadStore || !Config.VectorizeMemOps) - return false; - } else if (CastInst *C = dyn_cast(I)) { - // We can vectorize casts, but not casts of pointer types, etc. - if (!Config.VectorizeCasts) - return false; - - Type *SrcTy = C->getSrcTy(); - if (!SrcTy->isSingleValueType()) - return false; - - Type *DestTy = C->getDestTy(); - if (!DestTy->isSingleValueType()) - return false; - } else if (SelectInst *SI = dyn_cast(I)) { - if (!Config.VectorizeSelect) - return false; - // We can vectorize a select if either all operands are scalars, - // or all operands are vectors. Trying to "widen" a select between - // vectors that has a scalar condition results in a malformed select. - // FIXME: We could probably be smarter about this by rewriting the select - // with different types instead. - return (SI->getCondition()->getType()->isVectorTy() == - SI->getTrueValue()->getType()->isVectorTy()); - } else if (isa(I)) { - if (!Config.VectorizeCmp) - return false; - } else if (GetElementPtrInst *G = dyn_cast(I)) { - if (!Config.VectorizeGEP) - return false; - - // Currently, vector GEPs exist only with one index. - if (G->getNumIndices() != 1) - return false; - } else if (!(I->isBinaryOp() || isa(I) || - isa(I) || isa(I))) { - return false; - } - - Type *T1, *T2; - getInstructionTypes(I, T1, T2); - - // Not every type can be vectorized... - if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || - !(VectorType::isValidElementType(T2) || T2->isVectorTy())) - return false; - - if (T1->getScalarSizeInBits() == 1) { - if (!Config.VectorizeBools) - return false; - } else { - if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) - return false; - } - - if (T2->getScalarSizeInBits() == 1) { - if (!Config.VectorizeBools) - return false; - } else { - if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) - return false; - } - - if (!Config.VectorizeFloats - && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) - return false; - - // Don't vectorize target-specific types. - if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) - return false; - if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) - return false; - - if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() || - T2->getScalarType()->isPointerTy())) - return false; - - if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits)) - return false; - - return true; - } - - // This function returns true if the two provided instructions are compatible - // (meaning that they can be fused into a vector instruction). This assumes - // that I has already been determined to be vectorizable and that J is not - // in the use dag of I. - bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len, - int &CostSavings, int &FixedOrder) { - DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << - " <-> " << *J << "\n"); - - CostSavings = 0; - FixedOrder = 0; - - // Loads and stores can be merged if they have different alignments, - // but are otherwise the same. - if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | - (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) - return false; - - Type *IT1, *IT2, *JT1, *JT2; - getInstructionTypes(I, IT1, IT2); - getInstructionTypes(J, JT1, JT2); - unsigned MaxTypeBits = std::max( - IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), - IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (!TTI && MaxTypeBits > Config.VectorBits) - return false; - - // FIXME: handle addsub-type operations! - - if (IsSimpleLoadStore) { - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; - int64_t OffsetInElmts = 0; - if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, OffsetInElmts) && - std::abs(OffsetInElmts) == 1) { - FixedOrder = (int) OffsetInElmts; - unsigned BottomAlignment = IAlignment; - if (OffsetInElmts < 0) BottomAlignment = JAlignment; - - Type *aTypeI = isa(I) ? - cast(I)->getValueOperand()->getType() : I->getType(); - Type *aTypeJ = isa(J) ? - cast(J)->getValueOperand()->getType() : J->getType(); - Type *VType = getVecTypeForPair(aTypeI, aTypeJ); - - if (Config.AlignedOnly) { - // An aligned load or store is possible only if the instruction - // with the lower offset has an alignment suitable for the - // vector type. - const DataLayout &DL = I->getModule()->getDataLayout(); - unsigned VecAlignment = DL.getPrefTypeAlignment(VType); - if (BottomAlignment < VecAlignment) - return false; - } - - if (TTI) { - unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, - IAlignment, IAddressSpace); - unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, - JAlignment, JAddressSpace); - unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, - BottomAlignment, - IAddressSpace); - - ICost += TTI->getAddressComputationCost(aTypeI); - JCost += TTI->getAddressComputationCost(aTypeJ); - VCost += TTI->getAddressComputationCost(VType); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. - unsigned VParts = TTI->getNumberOfParts(VType); - if (VParts > 1) - return false; - else if (!VParts && VCost == ICost + JCost) - return false; - - CostSavings = ICost + JCost - VCost; - } - } else { - return false; - } - } else if (TTI) { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue; - unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I); - unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J); - Type *VT1 = getVecTypeForPair(IT1, JT1), - *VT2 = getVecTypeForPair(IT2, JT2); - - // On some targets (example X86) the cost of a vector shift may vary - // depending on whether the second operand is a Uniform or - // NonUniform Constant. - switch (I->getOpcode()) { - default : break; - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - - // If both I and J are scalar shifts by constant, then the - // merged vector shift count would be either a constant splat value - // or a non-uniform vector of constants. - if (ConstantInt *CII = dyn_cast(I->getOperand(1))) { - if (ConstantInt *CIJ = dyn_cast(J->getOperand(1))) - Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue : - TargetTransformInfo::OK_NonUniformConstantValue; - } else { - // Check for a splat of a constant or for a non uniform vector - // of constants. - Value *IOp = I->getOperand(1); - Value *JOp = J->getOperand(1); - if ((isa(IOp) || isa(IOp)) && - (isa(JOp) || isa(JOp))) { - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - Constant *SplatValue = cast(IOp)->getSplatValue(); - if (SplatValue != nullptr && - SplatValue == cast(JOp)->getSplatValue()) - Op2VK = TargetTransformInfo::OK_UniformConstantValue; - } - } - } - - // Note that this procedure is incorrect for insert and extract element - // instructions (because combining these often results in a shuffle), - // but this cost is ignored (because insert and extract element - // instructions are assigned a zero depth factor and are not really - // fused in general). - unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. - unsigned VParts1 = TTI->getNumberOfParts(VT1), - VParts2 = TTI->getNumberOfParts(VT2); - if (VParts1 > 1 || VParts2 > 1) - return false; - else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) - return false; - - CostSavings = ICost + JCost - VCost; - } - - // The powi,ctlz,cttz intrinsics are special because only the first - // argument is vectorized, the second arguments must be equal. - CallInst *CI = dyn_cast(I); - Function *FI; - if (CI && (FI = CI->getCalledFunction())) { - Intrinsic::ID IID = FI->getIntrinsicID(); - if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) { - Value *A1I = CI->getArgOperand(1), - *A1J = cast(J)->getArgOperand(1); - const SCEV *A1ISCEV = SE->getSCEV(A1I), - *A1JSCEV = SE->getSCEV(A1J); - return (A1ISCEV == A1JSCEV); - } - - if (IID && TTI) { - FastMathFlags FMFCI; - if (auto *FPMOCI = dyn_cast(CI)) - FMFCI = FPMOCI->getFastMathFlags(); - SmallVector IArgs(CI->arg_operands()); - unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI); - - CallInst *CJ = cast(J); - - FastMathFlags FMFCJ; - if (auto *FPMOCJ = dyn_cast(CJ)) - FMFCJ = FPMOCJ->getFastMathFlags(); - - SmallVector JArgs(CJ->arg_operands()); - unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ); - - assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && - "Intrinsic argument counts differ"); - SmallVector Tys; - SmallVector VecArgs; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) && i == 1) { - Tys.push_back(CI->getArgOperand(i)->getType()); - VecArgs.push_back(CI->getArgOperand(i)); - } - else { - Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), - CJ->getArgOperand(i)->getType())); - // Add both operands, and then count their scalarization overhead - // with VF 1. - VecArgs.push_back(CI->getArgOperand(i)); - VecArgs.push_back(CJ->getArgOperand(i)); - } - } - - // Compute the scalarization cost here with the original operands (to - // check for uniqueness etc), and then call getIntrinsicInstrCost() - // with the constructed vector types. - Type *RetTy = getVecTypeForPair(IT1, JT1); - unsigned ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false); - ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1); - - FastMathFlags FMFV = FMFCI; - FMFV &= FMFCJ; - unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV, - ScalarizationCost); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. - unsigned RetParts = TTI->getNumberOfParts(RetTy); - if (RetParts > 1) - return false; - else if (!RetParts && VCost == ICost + JCost) - return false; - - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if (!Tys[i]->isVectorTy()) - continue; - - unsigned NumParts = TTI->getNumberOfParts(Tys[i]); - if (NumParts > 1) - return false; - else if (!NumParts && VCost == ICost + JCost) - return false; - } - - CostSavings = ICost + JCost - VCost; - } - } - - return true; - } - - // Figure out whether or not J uses I and update the users and write-set - // structures associated with I. Specifically, Users represents the set of - // instructions that depend on I. WriteSet represents the set - // of memory locations that are dependent on I. If UpdateUsers is true, - // and J uses I, then Users is updated to contain J and WriteSet is updated - // to contain any memory locations to which J writes. The function returns - // true if J uses I. By default, alias analysis is used to determine - // whether J reads from memory that overlaps with a location in WriteSet. - // If LoadMoveSet is not null, then it is a previously-computed map - // where the key is the memory-based user instruction and the value is - // the instruction to be compared with I. So, if LoadMoveSet is provided, - // then the alias analysis is not used. This is necessary because this - // function is called during the process of moving instructions during - // vectorization and the results of the alias analysis are not stable during - // that process. - bool BBVectorize::trackUsesOfI(DenseSet &Users, - AliasSetTracker &WriteSet, Instruction *I, - Instruction *J, bool UpdateUsers, - DenseSet *LoadMoveSetPairs) { - bool UsesI = false; - - // This instruction may already be marked as a user due, for example, to - // being a member of a selected pair. - if (Users.count(J)) - UsesI = true; - - if (!UsesI) - for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); - JU != JE; ++JU) { - Value *V = *JU; - if (I == V || Users.count(V)) { - UsesI = true; - break; - } - } - if (!UsesI && J->mayReadFromMemory()) { - if (LoadMoveSetPairs) { - UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); - } else { - for (AliasSetTracker::iterator W = WriteSet.begin(), - WE = WriteSet.end(); W != WE; ++W) { - if (W->aliasesUnknownInst(J, *AA)) { - UsesI = true; - break; - } - } - } - } - - if (UsesI && UpdateUsers) { - if (J->mayWriteToMemory()) WriteSet.add(J); - Users.insert(J); - } - - return UsesI; - } - - // This function iterates over all instruction pairs in the provided - // basic block and collects all candidate pairs for vectorization. - bool BBVectorize::getCandidatePairs(BasicBlock &BB, - BasicBlock::iterator &Start, - DenseMap > &CandidatePairs, - DenseSet &FixedOrderPairs, - DenseMap &CandidatePairCostSavings, - std::vector &PairableInsts, bool NonPow2Len) { - size_t TotalPairs = 0; - BasicBlock::iterator E = BB.end(); - if (Start == E) return false; - - bool ShouldContinue = false, IAfterStart = false; - for (BasicBlock::iterator I = Start++; I != E; ++I) { - if (I == Start) IAfterStart = true; - - bool IsSimpleLoadStore; - if (!isInstVectorizable(&*I, IsSimpleLoadStore)) - continue; - - // Look for an instruction with which to pair instruction *I... - DenseSet Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) - WriteSet.add(&*I); - - bool JAfterStart = IAfterStart; - BasicBlock::iterator J = std::next(I); - for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { - if (J == Start) - JAfterStart = true; - - // Determine if J uses I, if so, exit the loop. - bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep); - if (Config.FastDep) { - // Note: For this heuristic to be effective, independent operations - // must tend to be intermixed. This is likely to be true from some - // kinds of grouped loop unrolling (but not the generic LLVM pass), - // but otherwise may require some kind of reordering pass. - - // When using fast dependency analysis, - // stop searching after first use: - if (UsesI) break; - } else { - if (UsesI) continue; - } - - // J does not use I, and comes before the first use of I, so it can be - // merged with I if the instructions are compatible. - int CostSavings, FixedOrder; - if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, - CostSavings, FixedOrder)) - continue; - - // J is a candidate for merging with I. - if (PairableInsts.empty() || - PairableInsts[PairableInsts.size() - 1] != &*I) { - PairableInsts.push_back(&*I); - } - - CandidatePairs[&*I].push_back(&*J); - ++TotalPairs; - if (TTI) - CandidatePairCostSavings.insert( - ValuePairWithCost(ValuePair(&*I, &*J), CostSavings)); - - if (FixedOrder == 1) - FixedOrderPairs.insert(ValuePair(&*I, &*J)); - else if (FixedOrder == -1) - FixedOrderPairs.insert(ValuePair(&*J, &*I)); - - // The next call to this function must start after the last instruction - // selected during this invocation. - if (JAfterStart) { - Start = std::next(J); - IAfterStart = JAfterStart = false; - } - - DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " - << *I << " <-> " << *J << " (cost savings: " << - CostSavings << ")\n"); - - // If we have already found too many pairs, break here and this function - // will be called again starting after the last instruction selected - // during this invocation. - if (PairableInsts.size() >= Config.MaxInsts || - TotalPairs >= Config.MaxPairs) { - ShouldContinue = true; - break; - } - } - - if (ShouldContinue) - break; - } - - DEBUG(dbgs() << "BBV: found " << PairableInsts.size() - << " instructions with candidate pairs\n"); - - return ShouldContinue; - } - - // Finds candidate pairs connected to the pair P = . This means that - // it looks for pairs such that both members have an input which is an - // output of PI or PJ. - void BBVectorize::computePairsConnectedTo( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseMap &PairConnectionTypes, - ValuePair P) { - StoreInst *SI, *SJ; - - // For each possible pairing for this variable, look at the uses of - // the first value... - for (Value::user_iterator I = P.first->user_begin(), - E = P.first->user_end(); - I != E; ++I) { - User *UI = *I; - if (isa(UI)) { - // A pair cannot be connected to a load because the load only takes one - // operand (the address) and it is a scalar even after vectorization. - continue; - } else if ((SI = dyn_cast(UI)) && - P.first == SI->getPointerOperand()) { - // Similarly, a pair cannot be connected to a store through its - // pointer operand. - continue; - } - - // For each use of the first variable, look for uses of the second - // variable... - for (User *UJ : P.second->users()) { - if ((SJ = dyn_cast(UJ)) && - P.second == SJ->getPointerOperand()) - continue; - - // Look for : - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); - } - - // Look for : - if (CandidatePairsSet.count(ValuePair(UJ, UI))) { - VPPair VP(P, ValuePair(UJ, UI)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); - } - } - - if (Config.SplatBreaksChain) continue; - // Look for cases where just the first value in the pair is used by - // both members of another pair (splatting). - for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) { - User *UJ = *J; - if ((SJ = dyn_cast(UJ)) && - P.first == SJ->getPointerOperand()) - continue; - - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); - } - } - } - - if (Config.SplatBreaksChain) return; - // Look for cases where just the second value in the pair is used by - // both members of another pair (splatting). - for (Value::user_iterator I = P.second->user_begin(), - E = P.second->user_end(); - I != E; ++I) { - User *UI = *I; - if (isa(UI)) - continue; - else if ((SI = dyn_cast(UI)) && - P.second == SI->getPointerOperand()) - continue; - - for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) { - User *UJ = *J; - if ((SJ = dyn_cast(UJ)) && - P.second == SJ->getPointerOperand()) - continue; - - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); - } - } - } - } - - // This function figures out which pairs are connected. Two pairs are - // connected if some output of the first pair forms an input to both members - // of the second pair. - void BBVectorize::computeConnectedPairs( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseMap &PairConnectionTypes) { - for (std::vector::iterator PI = PairableInsts.begin(), - PE = PairableInsts.end(); PI != PE; ++PI) { - DenseMap >::iterator PP = - CandidatePairs.find(*PI); - if (PP == CandidatePairs.end()) - continue; - - for (std::vector::iterator P = PP->second.begin(), - E = PP->second.end(); P != E; ++P) - computePairsConnectedTo(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, - PairConnectionTypes, ValuePair(*PI, *P)); - } - - DEBUG(size_t TotalPairs = 0; - for (DenseMap >::iterator I = - ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) - TotalPairs += I->second.size(); - dbgs() << "BBV: found " << TotalPairs - << " pair connections.\n"); - } - - // This function builds a set of use tuples such that is in the set - // if B is in the use dag of A. If B is in the use dag of A, then B - // depends on the output of A. - void BBVectorize::buildDepMap( - BasicBlock &BB, - DenseMap > &CandidatePairs, - std::vector &PairableInsts, - DenseSet &PairableInstUsers) { - DenseSet IsInPair; - for (DenseMap >::iterator C = - CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { - IsInPair.insert(C->first); - IsInPair.insert(C->second.begin(), C->second.end()); - } - - // Iterate through the basic block, recording all users of each - // pairable instruction. - - BasicBlock::iterator E = BB.end(), EL = - BasicBlock::iterator(cast(PairableInsts.back())); - for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { - if (IsInPair.find(&*I) == IsInPair.end()) - continue; - - DenseSet Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) - WriteSet.add(&*I); - - for (BasicBlock::iterator J = std::next(I); J != E; ++J) { - (void)trackUsesOfI(Users, WriteSet, &*I, &*J); - - if (J == EL) - break; - } - - for (DenseSet::iterator U = Users.begin(), E = Users.end(); - U != E; ++U) { - if (IsInPair.find(*U) == IsInPair.end()) continue; - PairableInstUsers.insert(ValuePair(&*I, *U)); - } - - if (I == EL) - break; - } - } - - // Returns true if an input to pair P is an output of pair Q and also an - // input of pair Q is an output of pair P. If this is the case, then these - // two pairs cannot be simultaneously fused. - bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, - DenseSet &PairableInstUsers, - DenseMap > *PairableInstUserMap, - DenseSet *PairableInstUserPairSet) { - // Two pairs are in conflict if they are mutual Users of eachother. - bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || - PairableInstUsers.count(ValuePair(P.first, Q.second)) || - PairableInstUsers.count(ValuePair(P.second, Q.first)) || - PairableInstUsers.count(ValuePair(P.second, Q.second)); - bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || - PairableInstUsers.count(ValuePair(Q.first, P.second)) || - PairableInstUsers.count(ValuePair(Q.second, P.first)) || - PairableInstUsers.count(ValuePair(Q.second, P.second)); - if (PairableInstUserMap) { - // FIXME: The expensive part of the cycle check is not so much the cycle - // check itself but this edge insertion procedure. This needs some - // profiling and probably a different data structure. - if (PUsesQ) { - if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) - (*PairableInstUserMap)[Q].push_back(P); - } - if (QUsesP) { - if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) - (*PairableInstUserMap)[P].push_back(Q); - } - } - - return (QUsesP && PUsesQ); - } - - // This function walks the use graph of current pairs to see if, starting - // from P, the walk returns to P. - bool BBVectorize::pairWillFormCycle(ValuePair P, - DenseMap > &PairableInstUserMap, - DenseSet &CurrentPairs) { - DEBUG(if (DebugCycleCheck) - dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " - << *P.second << "\n"); - // A lookup table of visisted pairs is kept because the PairableInstUserMap - // contains non-direct associations. - DenseSet Visited; - SmallVector Q; - // General depth-first post-order traversal: - Q.push_back(P); - do { - ValuePair QTop = Q.pop_back_val(); - Visited.insert(QTop); - - DEBUG(if (DebugCycleCheck) - dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " - << *QTop.second << "\n"); - DenseMap >::iterator QQ = - PairableInstUserMap.find(QTop); - if (QQ == PairableInstUserMap.end()) - continue; - - for (std::vector::iterator C = QQ->second.begin(), - CE = QQ->second.end(); C != CE; ++C) { - if (*C == P) { - DEBUG(dbgs() - << "BBV: rejected to prevent non-trivial cycle formation: " - << QTop.first << " <-> " << C->second << "\n"); - return true; - } - - if (CurrentPairs.count(*C) && !Visited.count(*C)) - Q.push_back(*C); - } - } while (!Q.empty()); - - return false; - } - - // This function builds the initial dag of connected pairs with the - // pair J at the root. - void BBVectorize::buildInitialDAGFor( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseSet &PairableInstUsers, - DenseMap &ChosenPairs, - DenseMap &DAG, ValuePair J) { - // Each of these pairs is viewed as the root node of a DAG. The DAG - // is then walked (depth-first). As this happens, we keep track of - // the pairs that compose the DAG and the maximum depth of the DAG. - SmallVector Q; - // General depth-first post-order traversal: - Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); - do { - ValuePairWithDepth QTop = Q.back(); - - // Push each child onto the queue: - bool MoreChildren = false; - size_t MaxChildDepth = QTop.second; - DenseMap >::iterator QQ = - ConnectedPairs.find(QTop.first); - if (QQ != ConnectedPairs.end()) - for (std::vector::iterator k = QQ->second.begin(), - ke = QQ->second.end(); k != ke; ++k) { - // Make sure that this child pair is still a candidate: - if (CandidatePairsSet.count(*k)) { - DenseMap::iterator C = DAG.find(*k); - if (C == DAG.end()) { - size_t d = getDepthFactor(k->first); - Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); - MoreChildren = true; - } else { - MaxChildDepth = std::max(MaxChildDepth, C->second); - } - } - } - - if (!MoreChildren) { - // Record the current pair as part of the DAG: - DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); - Q.pop_back(); - } - } while (!Q.empty()); - } - - // Given some initial dag, prune it by removing conflicting pairs (pairs - // that cannot be simultaneously chosen for vectorization). - void BBVectorize::pruneDAGFor( - DenseMap > &CandidatePairs, - std::vector &PairableInsts, - DenseMap > &ConnectedPairs, - DenseSet &PairableInstUsers, - DenseMap > &PairableInstUserMap, - DenseSet &PairableInstUserPairSet, - DenseMap &ChosenPairs, - DenseMap &DAG, - DenseSet &PrunedDAG, ValuePair J, - bool UseCycleCheck) { - SmallVector Q; - // General depth-first post-order traversal: - Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); - do { - ValuePairWithDepth QTop = Q.pop_back_val(); - PrunedDAG.insert(QTop.first); - - // Visit each child, pruning as necessary... - SmallVector BestChildren; - DenseMap >::iterator QQ = - ConnectedPairs.find(QTop.first); - if (QQ == ConnectedPairs.end()) - continue; - - for (std::vector::iterator K = QQ->second.begin(), - KE = QQ->second.end(); K != KE; ++K) { - DenseMap::iterator C = DAG.find(*K); - if (C == DAG.end()) continue; - - // This child is in the DAG, now we need to make sure it is the - // best of any conflicting children. There could be multiple - // conflicting children, so first, determine if we're keeping - // this child, then delete conflicting children as necessary. - - // It is also necessary to guard against pairing-induced - // dependencies. Consider instructions a .. x .. y .. b - // such that (a,b) are to be fused and (x,y) are to be fused - // but a is an input to x and b is an output from y. This - // means that y cannot be moved after b but x must be moved - // after b for (a,b) to be fused. In other words, after - // fusing (a,b) we have y .. a/b .. x where y is an input - // to a/b and x is an output to a/b: x and y can no longer - // be legally fused. To prevent this condition, we must - // make sure that a child pair added to the DAG is not - // both an input and output of an already-selected pair. - - // Pairing-induced dependencies can also form from more complicated - // cycles. The pair vs. pair conflicts are easy to check, and so - // that is done explicitly for "fast rejection", and because for - // child vs. child conflicts, we may prefer to keep the current - // pair in preference to the already-selected child. - DenseSet CurrentPairs; - - bool CanAdd = true; - for (SmallVectorImpl::iterator C2 - = BestChildren.begin(), E2 = BestChildren.end(); - C2 != E2; ++C2) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - if (C2->second >= C->second) { - CanAdd = false; - break; - } - - CurrentPairs.insert(C2->first); - } - } - if (!CanAdd) continue; - - // Even worse, this child could conflict with another node already - // selected for the DAG. If that is the case, ignore this child. - for (DenseSet::iterator T = PrunedDAG.begin(), - E2 = PrunedDAG.end(); T != E2; ++T) { - if (T->first == C->first.first || - T->first == C->first.second || - T->second == C->first.first || - T->second == C->first.second || - pairsConflict(*T, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(*T); - } - if (!CanAdd) continue; - - // And check the queue too... - for (SmallVectorImpl::iterator C2 = Q.begin(), - E2 = Q.end(); C2 != E2; ++C2) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(C2->first); - } - if (!CanAdd) continue; - - // Last but not least, check for a conflict with any of the - // already-chosen pairs. - for (DenseMap::iterator C2 = - ChosenPairs.begin(), E2 = ChosenPairs.end(); - C2 != E2; ++C2) { - if (pairsConflict(*C2, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(*C2); - } - if (!CanAdd) continue; - - // To check for non-trivial cycles formed by the addition of the - // current pair we've formed a list of all relevant pairs, now use a - // graph walk to check for a cycle. We start from the current pair and - // walk the use dag to see if we again reach the current pair. If we - // do, then the current pair is rejected. - - // FIXME: It may be more efficient to use a topological-ordering - // algorithm to improve the cycle check. This should be investigated. - if (UseCycleCheck && - pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) - continue; - - // This child can be added, but we may have chosen it in preference - // to an already-selected child. Check for this here, and if a - // conflict is found, then remove the previously-selected child - // before adding this one in its place. - for (SmallVectorImpl::iterator C2 - = BestChildren.begin(); C2 != BestChildren.end();) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers)) - C2 = BestChildren.erase(C2); - else - ++C2; - } - - BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); - } - - for (SmallVectorImpl::iterator C - = BestChildren.begin(), E2 = BestChildren.end(); - C != E2; ++C) { - size_t DepthF = getDepthFactor(C->first.first); - Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); - } - } while (!Q.empty()); - } - - // This function finds the best dag of mututally-compatible connected - // pairs, given the choice of root pairs as an iterator range. - void BBVectorize::findBestDAGFor( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - DenseMap &CandidatePairCostSavings, - std::vector &PairableInsts, - DenseSet &FixedOrderPairs, - DenseMap &PairConnectionTypes, - DenseMap > &ConnectedPairs, - DenseMap > &ConnectedPairDeps, - DenseSet &PairableInstUsers, - DenseMap > &PairableInstUserMap, - DenseSet &PairableInstUserPairSet, - DenseMap &ChosenPairs, - DenseSet &BestDAG, size_t &BestMaxDepth, - int &BestEffSize, Value *II, std::vector&JJ, - bool UseCycleCheck) { - for (std::vector::iterator J = JJ.begin(), JE = JJ.end(); - J != JE; ++J) { - ValuePair IJ(II, *J); - if (!CandidatePairsSet.count(IJ)) - continue; - - // Before going any further, make sure that this pair does not - // conflict with any already-selected pairs (see comment below - // near the DAG pruning for more details). - DenseSet ChosenPairSet; - bool DoesConflict = false; - for (DenseMap::iterator C = ChosenPairs.begin(), - E = ChosenPairs.end(); C != E; ++C) { - if (pairsConflict(*C, IJ, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { - DoesConflict = true; - break; - } - - ChosenPairSet.insert(*C); - } - if (DoesConflict) continue; - - if (UseCycleCheck && - pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) - continue; - - DenseMap DAG; - buildInitialDAGFor(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, - PairableInstUsers, ChosenPairs, DAG, IJ); - - // Because we'll keep the child with the largest depth, the largest - // depth is still the same in the unpruned DAG. - size_t MaxDepth = DAG.lookup(IJ); - - DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" - << *IJ.first << " <-> " << *IJ.second << "} of depth " << - MaxDepth << " and size " << DAG.size() << "\n"); - - // At this point the DAG has been constructed, but, may contain - // contradictory children (meaning that different children of - // some dag node may be attempting to fuse the same instruction). - // So now we walk the dag again, in the case of a conflict, - // keep only the child with the largest depth. To break a tie, - // favor the first child. - - DenseSet PrunedDAG; - pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, - PairableInstUsers, PairableInstUserMap, - PairableInstUserPairSet, - ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); - - int EffSize = 0; - if (TTI) { - DenseSet PrunedDAGInstrs; - for (DenseSet::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) { - PrunedDAGInstrs.insert(S->first); - PrunedDAGInstrs.insert(S->second); - } - - // The set of pairs that have already contributed to the total cost. - DenseSet IncomingPairs; - - // If the cost model were perfect, this might not be necessary; but we - // need to make sure that we don't get stuck vectorizing our own - // shuffle chains. - bool HasNontrivialInsts = false; - - // The node weights represent the cost savings associated with - // fusing the pair of instructions. - for (DenseSet::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) { - if (!isa(S->first) && - !isa(S->first) && - !isa(S->first)) - HasNontrivialInsts = true; - - bool FlipOrder = false; - - if (getDepthFactor(S->first)) { - int ESContrib = CandidatePairCostSavings.find(*S)->second; - DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" - << *S->first << " <-> " << *S->second << "} = " << - ESContrib << "\n"); - EffSize += ESContrib; - } - - // The edge weights contribute in a negative sense: they represent - // the cost of shuffles. - DenseMap >::iterator SS = - ConnectedPairDeps.find(*S); - if (SS != ConnectedPairDeps.end()) { - unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::vector::iterator T = SS->second.begin(), - TE = SS->second.end(); T != TE; ++T) { - VPPair Q(*S, *T); - if (!PrunedDAG.count(Q.second)) - continue; - DenseMap::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - ++NumDepsDirect; - else if (R->second == PairConnectionSwap) - ++NumDepsSwap; - } - - // If there are more swaps than direct connections, then - // the pair order will be flipped during fusion. So the real - // number of swaps is the minimum number. - FlipOrder = !FixedOrderPairs.count(*S) && - ((NumDepsSwap > NumDepsDirect) || - FixedOrderPairs.count(ValuePair(S->second, S->first))); - - for (std::vector::iterator T = SS->second.begin(), - TE = SS->second.end(); T != TE; ++T) { - VPPair Q(*S, *T); - if (!PrunedDAG.count(Q.second)) - continue; - DenseMap::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - Type *Ty1 = Q.second.first->getType(), - *Ty2 = Q.second.second->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - if ((R->second == PairConnectionDirect && FlipOrder) || - (R->second == PairConnectionSwap && !FlipOrder) || - R->second == PairConnectionSplat) { - int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); - - if (VTy->getVectorNumElements() == 2) { - if (R->second == PairConnectionSplat) - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Broadcast, VTy)); - else - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Reverse, VTy)); - } - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *Q.second.first << " <-> " << *Q.second.second << - "} -> {" << - *S->first << " <-> " << *S->second << "} = " << - ESContrib << "\n"); - EffSize -= ESContrib; - } - } - } - - // Compute the cost of outgoing edges. We assume that edges outgoing - // to shuffles, inserts or extracts can be merged, and so contribute - // no additional cost. - if (!S->first->getType()->isVoidTy()) { - Type *Ty1 = S->first->getType(), - *Ty2 = S->second->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - - bool NeedsExtraction = false; - for (User *U : S->first->users()) { - if (ShuffleVectorInst *SI = dyn_cast(U)) { - // Shuffle can be folded if it has no other input - if (isa(SI->getOperand(1))) - continue; - } - if (isa(U)) - continue; - if (PrunedDAGInstrs.count(U)) - continue; - NeedsExtraction = true; - break; - } - - if (NeedsExtraction) { - int ESContrib; - if (Ty1->isVectorTy()) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - Ty1, VTy); - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); - } else - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::ExtractElement, VTy, 0); - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *S->first << "} = " << ESContrib << "\n"); - EffSize -= ESContrib; - } - - NeedsExtraction = false; - for (User *U : S->second->users()) { - if (ShuffleVectorInst *SI = dyn_cast(U)) { - // Shuffle can be folded if it has no other input - if (isa(SI->getOperand(1))) - continue; - } - if (isa(U)) - continue; - if (PrunedDAGInstrs.count(U)) - continue; - NeedsExtraction = true; - break; - } - - if (NeedsExtraction) { - int ESContrib; - if (Ty2->isVectorTy()) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - Ty2, VTy); - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, VTy, - Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); - } else - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::ExtractElement, VTy, 1); - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *S->second << "} = " << ESContrib << "\n"); - EffSize -= ESContrib; - } - } - - // Compute the cost of incoming edges. - if (!isa(S->first) && !isa(S->first)) { - Instruction *S1 = cast(S->first), - *S2 = cast(S->second); - for (unsigned o = 0; o < S1->getNumOperands(); ++o) { - Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); - - // Combining constants into vector constants (or small vector - // constants into larger ones are assumed free). - if (isa(O1) && isa(O2)) - continue; - - if (FlipOrder) - std::swap(O1, O2); - - ValuePair VP = ValuePair(O1, O2); - ValuePair VPR = ValuePair(O2, O1); - - // Internal edges are not handled here. - if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) - continue; - - Type *Ty1 = O1->getType(), - *Ty2 = O2->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - - // Combining vector operations of the same type is also assumed - // folded with other operations. - if (Ty1 == Ty2) { - // If both are insert elements, then both can be widened. - InsertElementInst *IEO1 = dyn_cast(O1), - *IEO2 = dyn_cast(O2); - if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) - continue; - // If both are extract elements, and both have the same input - // type, then they can be replaced with a shuffle - ExtractElementInst *EIO1 = dyn_cast(O1), - *EIO2 = dyn_cast(O2); - if (EIO1 && EIO2 && - EIO1->getOperand(0)->getType() == - EIO2->getOperand(0)->getType()) - continue; - // If both are a shuffle with equal operand types and only two - // unqiue operands, then they can be replaced with a single - // shuffle - ShuffleVectorInst *SIO1 = dyn_cast(O1), - *SIO2 = dyn_cast(O2); - if (SIO1 && SIO2 && - SIO1->getOperand(0)->getType() == - SIO2->getOperand(0)->getType()) { - SmallSet SIOps; - SIOps.insert(SIO1->getOperand(0)); - SIOps.insert(SIO1->getOperand(1)); - SIOps.insert(SIO2->getOperand(0)); - SIOps.insert(SIO2->getOperand(1)); - if (SIOps.size() <= 2) - continue; - } - } - - int ESContrib; - // This pair has already been formed. - if (IncomingPairs.count(VP)) { - continue; - } else if (IncomingPairs.count(VPR)) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); - - if (VTy->getVectorNumElements() == 2) - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Reverse, VTy)); - } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, VTy, 0); - ESContrib += (int) TTI->getVectorInstrCost( - Instruction::InsertElement, VTy, 1); - } else if (!Ty1->isVectorTy()) { - // O1 needs to be inserted into a vector of size O2, and then - // both need to be shuffled together. - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, Ty2, 0); - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - VTy, Ty2); - } else if (!Ty2->isVectorTy()) { - // O2 needs to be inserted into a vector of size O1, and then - // both need to be shuffled together. - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, Ty1, 0); - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - VTy, Ty1); - } else { - Type *TyBig = Ty1, *TySmall = Ty2; - if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) - std::swap(TyBig, TySmall); - - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, TyBig); - if (TyBig != TySmall) - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - TyBig, TySmall); - } - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" - << *O1 << " <-> " << *O2 << "} = " << - ESContrib << "\n"); - EffSize -= ESContrib; - IncomingPairs.insert(VP); - } - } - } - - if (!HasNontrivialInsts) { - DEBUG(if (DebugPairSelection) dbgs() << - "\tNo non-trivial instructions in DAG;" - " override to zero effective size\n"); - EffSize = 0; - } - } else { - for (DenseSet::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) - EffSize += (int) getDepthFactor(S->first); - } - - DEBUG(if (DebugPairSelection) - dbgs() << "BBV: found pruned DAG for pair {" - << *IJ.first << " <-> " << *IJ.second << "} of depth " << - MaxDepth << " and size " << PrunedDAG.size() << - " (effective size: " << EffSize << ")\n"); - if (((TTI && !UseChainDepthWithTI) || - MaxDepth >= Config.ReqChainDepth) && - EffSize > 0 && EffSize > BestEffSize) { - BestMaxDepth = MaxDepth; - BestEffSize = EffSize; - BestDAG = PrunedDAG; - } - } - } - - // Given the list of candidate pairs, this function selects those - // that will be fused into vector instructions. - void BBVectorize::choosePairs( - DenseMap > &CandidatePairs, - DenseSet &CandidatePairsSet, - DenseMap &CandidatePairCostSavings, - std::vector &PairableInsts, - DenseSet &FixedOrderPairs, - DenseMap &PairConnectionTypes, - DenseMap > &ConnectedPairs, - DenseMap > &ConnectedPairDeps, - DenseSet &PairableInstUsers, - DenseMap& ChosenPairs) { - bool UseCycleCheck = - CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; - - DenseMap > CandidatePairs2; - for (DenseSet::iterator I = CandidatePairsSet.begin(), - E = CandidatePairsSet.end(); I != E; ++I) { - std::vector &JJ = CandidatePairs2[I->second]; - if (JJ.empty()) JJ.reserve(32); - JJ.push_back(I->first); - } - - DenseMap > PairableInstUserMap; - DenseSet PairableInstUserPairSet; - for (std::vector::iterator I = PairableInsts.begin(), - E = PairableInsts.end(); I != E; ++I) { - // The number of possible pairings for this variable: - size_t NumChoices = CandidatePairs.lookup(*I).size(); - if (!NumChoices) continue; - - std::vector &JJ = CandidatePairs[*I]; - - // The best pair to choose and its dag: - size_t BestMaxDepth = 0; - int BestEffSize = 0; - DenseSet BestDAG; - findBestDAGFor(CandidatePairs, CandidatePairsSet, - CandidatePairCostSavings, - PairableInsts, FixedOrderPairs, PairConnectionTypes, - ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, PairableInstUserMap, - PairableInstUserPairSet, ChosenPairs, - BestDAG, BestMaxDepth, BestEffSize, *I, JJ, - UseCycleCheck); - - if (BestDAG.empty()) - continue; - - // A dag has been chosen (or not) at this point. If no dag was - // chosen, then this instruction, I, cannot be paired (and is no longer - // considered). - - DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " - << *cast(*I) << "\n"); - - for (DenseSet::iterator S = BestDAG.begin(), - SE2 = BestDAG.end(); S != SE2; ++S) { - // Insert the members of this dag into the list of chosen pairs. - ChosenPairs.insert(ValuePair(S->first, S->second)); - DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << - *S->second << "\n"); - - // Remove all candidate pairs that have values in the chosen dag. - std::vector &KK = CandidatePairs[S->first]; - for (std::vector::iterator K = KK.begin(), KE = KK.end(); - K != KE; ++K) { - if (*K == S->second) - continue; - - CandidatePairsSet.erase(ValuePair(S->first, *K)); - } - - std::vector &LL = CandidatePairs2[S->second]; - for (std::vector::iterator L = LL.begin(), LE = LL.end(); - L != LE; ++L) { - if (*L == S->first) - continue; - - CandidatePairsSet.erase(ValuePair(*L, S->second)); - } - - std::vector &MM = CandidatePairs[S->second]; - for (std::vector::iterator M = MM.begin(), ME = MM.end(); - M != ME; ++M) { - assert(*M != S->first && "Flipped pair in candidate list?"); - CandidatePairsSet.erase(ValuePair(S->second, *M)); - } - - std::vector &NN = CandidatePairs2[S->first]; - for (std::vector::iterator N = NN.begin(), NE = NN.end(); - N != NE; ++N) { - assert(*N != S->second && "Flipped pair in candidate list?"); - CandidatePairsSet.erase(ValuePair(*N, S->first)); - } - } - } - - DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); - } - - std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, - unsigned n = 0) { - if (!I->hasName()) - return ""; - - return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + - (n > 0 ? "." + utostr(n) : "")).str(); - } - - // Returns the value that is to be used as the pointer input to the vector - // instruction that fuses I with J. - Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, - Instruction *I, Instruction *J, unsigned o) { - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; - int64_t OffsetInElmts; - - // Note: the analysis might fail here, that is why the pair order has - // been precomputed (OffsetInElmts must be unused here). - (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, - OffsetInElmts, false); - - // The pointer value is taken to be the one with the lowest offset. - Value *VPtr = IPtr; - - Type *ArgTypeI = IPtr->getType()->getPointerElementType(); - Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - Type *VArgPtrType - = PointerType::get(VArgType, - IPtr->getType()->getPointerAddressSpace()); - return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), - /* insert before */ I); - } - - void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned MaskOffset, unsigned NumInElem, - unsigned NumInElem1, unsigned IdxOffset, - std::vector &Mask) { - unsigned NumElem1 = J->getType()->getVectorNumElements(); - for (unsigned v = 0; v < NumElem1; ++v) { - int m = cast(J)->getMaskValue(v); - if (m < 0) { - Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); - } else { - unsigned mm = m + (int) IdxOffset; - if (m >= (int) NumInElem1) - mm += (int) NumInElem; - - Mask[v+MaskOffset] = - ConstantInt::get(Type::getInt32Ty(Context), mm); - } - } - } - - // Returns the value that is to be used as the vector-shuffle mask to the - // vector instruction that fuses I with J. - Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, - Instruction *I, Instruction *J) { - // This is the shuffle mask. We need to append the second - // mask to the first, and the numbers need to be adjusted. - - Type *ArgTypeI = I->getType(); - Type *ArgTypeJ = J->getType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - unsigned NumElemI = ArgTypeI->getVectorNumElements(); - - // Get the total number of elements in the fused vector type. - // By definition, this must equal the number of elements in - // the final mask. - unsigned NumElem = VArgType->getVectorNumElements(); - std::vector Mask(NumElem); - - Type *OpTypeI = I->getOperand(0)->getType(); - unsigned NumInElemI = OpTypeI->getVectorNumElements(); - Type *OpTypeJ = J->getOperand(0)->getType(); - unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); - - // The fused vector will be: - // ----------------------------------------------------- - // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | - // ----------------------------------------------------- - // from which we'll extract NumElem total elements (where the first NumElemI - // of them come from the mask in I and the remainder come from the mask - // in J. - - // For the mask from the first pair... - fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, - 0, Mask); - - // For the mask from the second pair... - fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, - NumInElemI, Mask); - - return ConstantVector::get(Mask); - } - - bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, Value *&LOp, - unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeH, - bool IBeforeJ, unsigned IdxOff) { - bool ExpandedIEChain = false; - if (InsertElementInst *LIE = dyn_cast(LOp)) { - // If we have a pure insertelement chain, then this can be rewritten - // into a chain that directly builds the larger type. - if (isPureIEChain(LIE)) { - SmallVector VectElemts(numElemL, - UndefValue::get(ArgTypeL->getScalarType())); - InsertElementInst *LIENext = LIE; - do { - unsigned Idx = - cast(LIENext->getOperand(2))->getSExtValue(); - VectElemts[Idx] = LIENext->getOperand(1); - } while ((LIENext = - dyn_cast(LIENext->getOperand(0)))); - - LIENext = nullptr; - Value *LIEPrev = UndefValue::get(ArgTypeH); - for (unsigned i = 0; i < numElemL; ++i) { - if (isa(VectElemts[i])) continue; - LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], - ConstantInt::get(Type::getInt32Ty(Context), - i + IdxOff), - getReplacementName(IBeforeJ ? I : J, - true, o, i+1)); - LIENext->insertBefore(IBeforeJ ? J : I); - LIEPrev = LIENext; - } - - LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); - ExpandedIEChain = true; - } - } - - return ExpandedIEChain; - } - - static unsigned getNumScalarElements(Type *Ty) { - if (VectorType *VecTy = dyn_cast(Ty)) - return VecTy->getNumElements(); - return 1; - } - - // Returns the value to be used as the specified operand of the vector - // instruction that fuses I with J. - Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool IBeforeJ) { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - - // Compute the fused vector type for this operand - Type *ArgTypeI = I->getOperand(o)->getType(); - Type *ArgTypeJ = J->getOperand(o)->getType(); - VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - Instruction *L = I, *H = J; - Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - - unsigned numElemL = getNumScalarElements(ArgTypeL); - unsigned numElemH = getNumScalarElements(ArgTypeH); - - Value *LOp = L->getOperand(o); - Value *HOp = H->getOperand(o); - unsigned numElem = VArgType->getNumElements(); - - // First, we check if we can reuse the "original" vector outputs (if these - // exist). We might need a shuffle. - ExtractElementInst *LEE = dyn_cast(LOp); - ExtractElementInst *HEE = dyn_cast(HOp); - ShuffleVectorInst *LSV = dyn_cast(LOp); - ShuffleVectorInst *HSV = dyn_cast(HOp); - - // FIXME: If we're fusing shuffle instructions, then we can't apply this - // optimization. The input vectors to the shuffle might be a different - // length from the shuffle outputs. Unfortunately, the replacement - // shuffle mask has already been formed, and the mask entries are sensitive - // to the sizes of the inputs. - bool IsSizeChangeShuffle = - isa(L) && - (LOp->getType() != L->getType() || HOp->getType() != H->getType()); - - if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { - // We can have at most two unique vector inputs. - bool CanUseInputs = true; - Value *I1, *I2 = nullptr; - if (LEE) { - I1 = LEE->getOperand(0); - } else { - I1 = LSV->getOperand(0); - I2 = LSV->getOperand(1); - if (I2 == I1 || isa(I2)) - I2 = nullptr; - } - - if (HEE) { - Value *I3 = HEE->getOperand(0); - if (!I2 && I3 != I1) - I2 = I3; - else if (I3 != I1 && I3 != I2) - CanUseInputs = false; - } else { - Value *I3 = HSV->getOperand(0); - if (!I2 && I3 != I1) - I2 = I3; - else if (I3 != I1 && I3 != I2) - CanUseInputs = false; - - if (CanUseInputs) { - Value *I4 = HSV->getOperand(1); - if (!isa(I4)) { - if (!I2 && I4 != I1) - I2 = I4; - else if (I4 != I1 && I4 != I2) - CanUseInputs = false; - } - } - } - - if (CanUseInputs) { - unsigned LOpElem = - cast(LOp)->getOperand(0)->getType() - ->getVectorNumElements(); - - unsigned HOpElem = - cast(HOp)->getOperand(0)->getType() - ->getVectorNumElements(); - - // We have one or two input vectors. We need to map each index of the - // operands to the index of the original vector. - SmallVector, 8> II(numElem); - for (unsigned i = 0; i < numElemL; ++i) { - int Idx, INum; - if (LEE) { - Idx = - cast(LEE->getOperand(1))->getSExtValue(); - INum = LEE->getOperand(0) == I1 ? 0 : 1; - } else { - Idx = LSV->getMaskValue(i); - if (Idx < (int) LOpElem) { - INum = LSV->getOperand(0) == I1 ? 0 : 1; - } else { - Idx -= LOpElem; - INum = LSV->getOperand(1) == I1 ? 0 : 1; - } - } - - II[i] = std::pair(Idx, INum); - } - for (unsigned i = 0; i < numElemH; ++i) { - int Idx, INum; - if (HEE) { - Idx = - cast(HEE->getOperand(1))->getSExtValue(); - INum = HEE->getOperand(0) == I1 ? 0 : 1; - } else { - Idx = HSV->getMaskValue(i); - if (Idx < (int) HOpElem) { - INum = HSV->getOperand(0) == I1 ? 0 : 1; - } else { - Idx -= HOpElem; - INum = HSV->getOperand(1) == I1 ? 0 : 1; - } - } - - II[i + numElemL] = std::pair(Idx, INum); - } - - // We now have an array which tells us from which index of which - // input vector each element of the operand comes. - VectorType *I1T = cast(I1->getType()); - unsigned I1Elem = I1T->getNumElements(); - - if (!I2) { - // In this case there is only one underlying vector input. Check for - // the trivial case where we can use the input directly. - if (I1Elem == numElem) { - bool ElemInOrder = true; - for (unsigned i = 0; i < numElem; ++i) { - if (II[i].first != (int) i && II[i].first != -1) { - ElemInOrder = false; - break; - } - } - - if (ElemInOrder) - return I1; - } - - // A shuffle is needed. - std::vector Mask(numElem); - for (unsigned i = 0; i < numElem; ++i) { - int Idx = II[i].first; - if (Idx == -1) - Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); - else - Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - - Instruction *S = - new ShuffleVectorInst(I1, UndefValue::get(I1T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o)); - S->insertBefore(IBeforeJ ? J : I); - return S; - } - - VectorType *I2T = cast(I2->getType()); - unsigned I2Elem = I2T->getNumElements(); - - // This input comes from two distinct vectors. The first step is to - // make sure that both vectors are the same length. If not, the - // smaller one will need to grow before they can be shuffled together. - if (I1Elem < I2Elem) { - std::vector Mask(I2Elem); - unsigned v = 0; - for (; v < I1Elem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < I2Elem; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - Instruction *NewI1 = - new ShuffleVectorInst(I1, UndefValue::get(I1T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - NewI1->insertBefore(IBeforeJ ? J : I); - I1 = NewI1; - I1Elem = I2Elem; - } else if (I1Elem > I2Elem) { - std::vector Mask(I1Elem); - unsigned v = 0; - for (; v < I2Elem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < I1Elem; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - Instruction *NewI2 = - new ShuffleVectorInst(I2, UndefValue::get(I2T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - NewI2->insertBefore(IBeforeJ ? J : I); - I2 = NewI2; - } - - // Now that both I1 and I2 are the same length we can shuffle them - // together (and use the result). - std::vector Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) { - if (II[v].first == -1) { - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - } else { - int Idx = II[v].first + II[v].second * I1Elem; - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - } - - Instruction *NewOp = - new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, true, o)); - NewOp->insertBefore(IBeforeJ ? J : I); - return NewOp; - } - } - - Type *ArgType = ArgTypeL; - if (numElemL < numElemH) { - if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, - ArgTypeL, VArgType, IBeforeJ, 1)) { - // This is another short-circuit case: we're combining a scalar into - // a vector that is formed by an IE chain. We've just expanded the IE - // chain, now insert the scalar and we're done. - - Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, - getReplacementName(IBeforeJ ? I : J, true, o)); - S->insertBefore(IBeforeJ ? J : I); - return S; - } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, - ArgTypeH, IBeforeJ)) { - // The two vector inputs to the shuffle must be the same length, - // so extend the smaller vector to be the same length as the larger one. - Instruction *NLOp; - if (numElemL > 1) { - - std::vector Mask(numElemH); - unsigned v = 0; - for (; v < numElemL; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < numElemH; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } else { - NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } - - NLOp->insertBefore(IBeforeJ ? J : I); - LOp = NLOp; - } - - ArgType = ArgTypeH; - } else if (numElemL > numElemH) { - if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, - ArgTypeH, VArgType, IBeforeJ)) { - Instruction *S = - InsertElementInst::Create(LOp, HOp, - ConstantInt::get(Type::getInt32Ty(Context), - numElemL), - getReplacementName(IBeforeJ ? I : J, - true, o)); - S->insertBefore(IBeforeJ ? J : I); - return S; - } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, - ArgTypeL, IBeforeJ)) { - Instruction *NHOp; - if (numElemH > 1) { - std::vector Mask(numElemL); - unsigned v = 0; - for (; v < numElemH; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < numElemL; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } else { - NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } - - NHOp->insertBefore(IBeforeJ ? J : I); - HOp = NHOp; - } - } - - if (ArgType->isVectorTy()) { - unsigned numElem = VArgType->getVectorNumElements(); - std::vector Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) { - unsigned Idx = v; - // If the low vector was expanded, we need to skip the extra - // undefined entries. - if (v >= numElemL && numElemH > numElemL) - Idx += (numElemH - numElemL); - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - - Instruction *BV = new ShuffleVectorInst(LOp, HOp, - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, true, o)); - BV->insertBefore(IBeforeJ ? J : I); - return BV; - } - - Instruction *BV1 = InsertElementInst::Create( - UndefValue::get(VArgType), LOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - BV1->insertBefore(IBeforeJ ? J : I); - Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, - getReplacementName(IBeforeJ ? I : J, - true, o, 2)); - BV2->insertBefore(IBeforeJ ? J : I); - return BV2; - } - - // This function creates an array of values that will be used as the inputs - // to the vector instruction that fuses I with J. - void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, - Instruction *I, Instruction *J, - SmallVectorImpl &ReplacedOperands, - bool IBeforeJ) { - unsigned NumOperands = I->getNumOperands(); - - for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { - // Iterate backward so that we look at the store pointer - // first and know whether or not we need to flip the inputs. - - if (isa(I) || (o == 1 && isa(I))) { - // This is the pointer for a load/store instruction. - ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); - continue; - } else if (isa(I)) { - Function *F = cast(I)->getCalledFunction(); - Intrinsic::ID IID = F->getIntrinsicID(); - if (o == NumOperands-1) { - BasicBlock &BB = *I->getParent(); - - Module *M = BB.getParent()->getParent(); - Type *ArgTypeI = I->getType(); - Type *ArgTypeJ = J->getType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); - continue; - } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) && o == 1) { - // The second argument of powi/ctlz/cttz is a single integer/constant - // and we've already checked that both arguments are equal. - // As a result, we just keep I's second argument. - ReplacedOperands[o] = I->getOperand(o); - continue; - } - } else if (isa(I) && o == NumOperands-1) { - ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); - continue; - } - - ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); - } - } - - // This function creates two values that represent the outputs of the - // original I and J instructions. These are generally vector shuffles - // or extracts. In many cases, these will end up being unused and, thus, - // eliminated by later passes. - void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, - Instruction *J, Instruction *K, - Instruction *&InsertionPt, - Instruction *&K1, Instruction *&K2) { - if (isa(I)) - return; - - Type *IType = I->getType(); - Type *JType = J->getType(); - - VectorType *VType = getVecTypeForPair(IType, JType); - unsigned numElem = VType->getNumElements(); - - unsigned numElemI = getNumScalarElements(IType); - unsigned numElemJ = getNumScalarElements(JType); - - if (IType->isVectorTy()) { - std::vector Mask1(numElemI), Mask2(numElemI); - for (unsigned v = 0; v < numElemI; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v); - } - - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get(Mask1), - getReplacementName(K, false, 1)); - } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); - } - - if (JType->isVectorTy()) { - std::vector Mask1(numElemJ), Mask2(numElemJ); - for (unsigned v = 0; v < numElemJ; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v); - } - - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get(Mask2), - getReplacementName(K, false, 2)); - } else { - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); - K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); - } - - K1->insertAfter(K); - K2->insertAfter(K1); - InsertionPt = K2; - } - - // Move all uses of the function I (including pairing-induced uses) after J. - bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, - DenseSet &LoadMoveSetPairs, - Instruction *I, Instruction *J) { - // Skip to the first instruction past I. - BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - for (; cast(L) != J; ++L) - (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs); - - assert(cast(L) == J && - "Tracking has not proceeded far enough to check for dependencies"); - // If J is now in the use set of I, then trackUsesOfI will return true - // and we have a dependency cycle (and the fusing operation must abort). - return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); - } - - // Move all uses of the function I (including pairing-induced uses) after J. - void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, - DenseSet &LoadMoveSetPairs, - Instruction *&InsertionPt, - Instruction *I, Instruction *J) { - // Skip to the first instruction past I. - BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - for (; cast(L) != J;) { - if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) { - // Move this instruction - Instruction *InstToMove = &*L++; - - DEBUG(dbgs() << "BBV: moving: " << *InstToMove << - " to after " << *InsertionPt << "\n"); - InstToMove->removeFromParent(); - InstToMove->insertAfter(InsertionPt); - InsertionPt = InstToMove; - } else { - ++L; - } - } - } - - // Collect all load instruction that are in the move set of a given first - // pair member. These loads depend on the first instruction, I, and so need - // to be moved after J (the second instruction) when the pair is fused. - void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, - DenseMap &ChosenPairs, - DenseMap > &LoadMoveSet, - DenseSet &LoadMoveSetPairs, - Instruction *I) { - // Skip to the first instruction past I. - BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - // Note: We cannot end the loop when we reach J because J could be moved - // farther down the use chain by another instruction pairing. Also, J - // could be before I if this is an inverted input. - for (BasicBlock::iterator E = BB.end(); L != E; ++L) { - if (trackUsesOfI(Users, WriteSet, I, &*L)) { - if (L->mayReadFromMemory()) { - LoadMoveSet[&*L].push_back(I); - LoadMoveSetPairs.insert(ValuePair(&*L, I)); - } - } - } - } - - // In cases where both load/stores and the computation of their pointers - // are chosen for vectorization, we can end up in a situation where the - // aliasing analysis starts returning different query results as the - // process of fusing instruction pairs continues. Because the algorithm - // relies on finding the same use dags here as were found earlier, we'll - // need to precompute the necessary aliasing information here and then - // manually update it during the fusion process. - void BBVectorize::collectLoadMoveSet(BasicBlock &BB, - std::vector &PairableInsts, - DenseMap &ChosenPairs, - DenseMap > &LoadMoveSet, - DenseSet &LoadMoveSetPairs) { - for (std::vector::iterator PI = PairableInsts.begin(), - PIE = PairableInsts.end(); PI != PIE; ++PI) { - DenseMap::iterator P = ChosenPairs.find(*PI); - if (P == ChosenPairs.end()) continue; - - Instruction *I = cast(P->first); - collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, - LoadMoveSetPairs, I); - } - } - - // This function fuses the chosen instruction pairs into vector instructions, - // taking care preserve any needed scalar outputs and, then, it reorders the - // remaining instructions as needed (users of the first member of the pair - // need to be moved to after the location of the second member of the pair - // because the vector instruction is inserted in the location of the pair's - // second member). - void BBVectorize::fuseChosenPairs(BasicBlock &BB, - std::vector &PairableInsts, - DenseMap &ChosenPairs, - DenseSet &FixedOrderPairs, - DenseMap &PairConnectionTypes, - DenseMap > &ConnectedPairs, - DenseMap > &ConnectedPairDeps) { - LLVMContext& Context = BB.getContext(); - - // During the vectorization process, the order of the pairs to be fused - // could be flipped. So we'll add each pair, flipped, into the ChosenPairs - // list. After a pair is fused, the flipped pair is removed from the list. - DenseSet FlippedPairs; - for (DenseMap::iterator P = ChosenPairs.begin(), - E = ChosenPairs.end(); P != E; ++P) - FlippedPairs.insert(ValuePair(P->second, P->first)); - for (DenseSet::iterator P = FlippedPairs.begin(), - E = FlippedPairs.end(); P != E; ++P) - ChosenPairs.insert(*P); - - DenseMap > LoadMoveSet; - DenseSet LoadMoveSetPairs; - collectLoadMoveSet(BB, PairableInsts, ChosenPairs, - LoadMoveSet, LoadMoveSetPairs); - - DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); - - for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { - DenseMap::iterator P = ChosenPairs.find(&*PI); - if (P == ChosenPairs.end()) { - ++PI; - continue; - } - - if (getDepthFactor(P->first) == 0) { - // These instructions are not really fused, but are tracked as though - // they are. Any case in which it would be interesting to fuse them - // will be taken care of by InstCombine. - --NumFusedOps; - ++PI; - continue; - } - - Instruction *I = cast(P->first), - *J = cast(P->second); - - DEBUG(dbgs() << "BBV: fusing: " << *I << - " <-> " << *J << "\n"); - - // Remove the pair and flipped pair from the list. - DenseMap::iterator FP = ChosenPairs.find(P->second); - assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); - ChosenPairs.erase(FP); - ChosenPairs.erase(P); - - if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { - DEBUG(dbgs() << "BBV: fusion of: " << *I << - " <-> " << *J << - " aborted because of non-trivial dependency cycle\n"); - --NumFusedOps; - ++PI; - continue; - } - - // If the pair must have the other order, then flip it. - bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); - if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { - // This pair does not have a fixed order, and so we might want to - // flip it if that will yield fewer shuffles. We count the number - // of dependencies connected via swaps, and those directly connected, - // and flip the order if the number of swaps is greater. - bool OrigOrder = true; - DenseMap >::iterator IJ = - ConnectedPairDeps.find(ValuePair(I, J)); - if (IJ == ConnectedPairDeps.end()) { - IJ = ConnectedPairDeps.find(ValuePair(J, I)); - OrigOrder = false; - } - - if (IJ != ConnectedPairDeps.end()) { - unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::vector::iterator T = IJ->second.begin(), - TE = IJ->second.end(); T != TE; ++T) { - VPPair Q(IJ->first, *T); - DenseMap::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - ++NumDepsDirect; - else if (R->second == PairConnectionSwap) - ++NumDepsSwap; - } - - if (!OrigOrder) - std::swap(NumDepsDirect, NumDepsSwap); - - if (NumDepsSwap > NumDepsDirect) { - FlipPairOrder = true; - DEBUG(dbgs() << "BBV: reordering pair: " << *I << - " <-> " << *J << "\n"); - } - } - } - - Instruction *L = I, *H = J; - if (FlipPairOrder) - std::swap(H, L); - - // If the pair being fused uses the opposite order from that in the pair - // connection map, then we need to flip the types. - DenseMap >::iterator HL = - ConnectedPairs.find(ValuePair(H, L)); - if (HL != ConnectedPairs.end()) - for (std::vector::iterator T = HL->second.begin(), - TE = HL->second.end(); T != TE; ++T) { - VPPair Q(HL->first, *T); - DenseMap::iterator R = PairConnectionTypes.find(Q); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - R->second = PairConnectionSwap; - else if (R->second == PairConnectionSwap) - R->second = PairConnectionDirect; - } - - bool LBeforeH = !FlipPairOrder; - unsigned NumOperands = I->getNumOperands(); - SmallVector ReplacedOperands(NumOperands); - getReplacementInputsForPair(Context, L, H, ReplacedOperands, - LBeforeH); - - // Make a copy of the original operation, change its type to the vector - // type and replace its operands with the vector operands. - Instruction *K = L->clone(); - if (L->hasName()) - K->takeName(L); - else if (H->hasName()) - K->takeName(H); - - if (auto CS = CallSite(K)) { - SmallVector Tys; - FunctionType *Old = CS.getFunctionType(); - unsigned NumOld = Old->getNumParams(); - assert(NumOld <= ReplacedOperands.size()); - for (unsigned i = 0; i != NumOld; ++i) - Tys.push_back(ReplacedOperands[i]->getType()); - CS.mutateFunctionType( - FunctionType::get(getVecTypeForPair(L->getType(), H->getType()), - Tys, Old->isVarArg())); - } else if (!isa(K)) - K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - - unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_group}; - combineMetadata(K, H, KnownIDs); - K->andIRFlags(H); - - for (unsigned o = 0; o < NumOperands; ++o) - K->setOperand(o, ReplacedOperands[o]); - - K->insertAfter(J); - - // Instruction insertion point: - Instruction *InsertionPt = K; - Instruction *K1 = nullptr, *K2 = nullptr; - replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); - - // The use dag of the first original instruction must be moved to after - // the location of the second instruction. The entire use dag of the - // first instruction is disjoint from the input dag of the second - // (by definition), and so commutes with it. - - moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); - - if (!isa(I)) { - L->replaceAllUsesWith(K1); - H->replaceAllUsesWith(K2); - } - - // Instructions that may read from memory may be in the load move set. - // Once an instruction is fused, we no longer need its move set, and so - // the values of the map never need to be updated. However, when a load - // is fused, we need to merge the entries from both instructions in the - // pair in case those instructions were in the move set of some other - // yet-to-be-fused pair. The loads in question are the keys of the map. - if (I->mayReadFromMemory()) { - std::vector NewSetMembers; - DenseMap >::iterator II = - LoadMoveSet.find(I); - if (II != LoadMoveSet.end()) - for (std::vector::iterator N = II->second.begin(), - NE = II->second.end(); N != NE; ++N) - NewSetMembers.push_back(ValuePair(K, *N)); - DenseMap >::iterator JJ = - LoadMoveSet.find(J); - if (JJ != LoadMoveSet.end()) - for (std::vector::iterator N = JJ->second.begin(), - NE = JJ->second.end(); N != NE; ++N) - NewSetMembers.push_back(ValuePair(K, *N)); - for (std::vector::iterator A = NewSetMembers.begin(), - AE = NewSetMembers.end(); A != AE; ++A) { - LoadMoveSet[A->first].push_back(A->second); - LoadMoveSetPairs.insert(*A); - } - } - - // Before removing I, set the iterator to the next instruction. - PI = std::next(BasicBlock::iterator(I)); - if (cast(PI) == J) - ++PI; - - SE->forgetValue(I); - SE->forgetValue(J); - I->eraseFromParent(); - J->eraseFromParent(); - - DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << - BB << "\n"); - } - - DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); - } -} - -char BBVectorize::ID = 0; -static const char bb_vectorize_name[] = "Basic-Block Vectorization"; -INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) -INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) - -BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { - return new BBVectorize(C); -} - -bool -llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { - BBVectorize BBVectorizer(P, *BB.getParent(), C); - return BBVectorizer.vectorizeBB(BB); -} - -//===----------------------------------------------------------------------===// -VectorizeConfig::VectorizeConfig() { - VectorBits = ::VectorBits; - VectorizeBools = !::NoBools; - VectorizeInts = !::NoInts; - VectorizeFloats = !::NoFloats; - VectorizePointers = !::NoPointers; - VectorizeCasts = !::NoCasts; - VectorizeMath = !::NoMath; - VectorizeBitManipulations = !::NoBitManipulation; - VectorizeFMA = !::NoFMA; - VectorizeSelect = !::NoSelect; - VectorizeCmp = !::NoCmp; - VectorizeGEP = !::NoGEP; - VectorizeMemOps = !::NoMemOps; - AlignedOnly = ::AlignedOnly; - ReqChainDepth= ::ReqChainDepth; - SearchLimit = ::SearchLimit; - MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; - SplatBreaksChain = ::SplatBreaksChain; - MaxInsts = ::MaxInsts; - MaxPairs = ::MaxPairs; - MaxIter = ::MaxIter; - Pow2LenOnly = ::Pow2LenOnly; - NoMemOpBoost = ::NoMemOpBoost; - FastDep = ::FastDep; -} diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 395f440bda4..1aea73cd4a3 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMVectorize - BBVectorize.cpp LoadStoreVectorizer.cpp LoopVectorize.cpp SLPVectorizer.cpp diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index a2192831788..fb2f509dcba 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -26,7 +26,6 @@ using namespace llvm; /// initializeVectorizationPasses - Initialize all passes linked into the /// Vectorization library. void llvm::initializeVectorization(PassRegistry &Registry) { - initializeBBVectorizePass(Registry); initializeLoopVectorizePass(Registry); initializeSLPVectorizerPass(Registry); initializeLoadStoreVectorizerPass(Registry); @@ -36,8 +35,8 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) { initializeVectorization(*unwrap(R)); } +// DEPRECATED: Remove after the LLVM 5 release. void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createBBVectorizePass()); } void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { diff --git a/test/Feature/optnone-opt.ll b/test/Feature/optnone-opt.ll index efd35e56603..6410afb6be9 100644 --- a/test/Feature/optnone-opt.ll +++ b/test/Feature/optnone-opt.ll @@ -2,7 +2,7 @@ ; RUN: opt -O1 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1 ; RUN: opt -O2 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1 --check-prefix=OPT-O2O3 ; RUN: opt -O3 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1 --check-prefix=OPT-O2O3 -; RUN: opt -bb-vectorize -dce -die -gvn-hoist -loweratomic -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-MORE +; RUN: opt -dce -die -gvn-hoist -loweratomic -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-MORE ; RUN: opt -indvars -licm -loop-deletion -loop-extract -loop-idiom -loop-instsimplify -loop-reduce -loop-reroll -loop-rotate -loop-unroll -loop-unswitch -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-LOOP ; REQUIRES: asserts @@ -55,7 +55,6 @@ attributes #0 = { optnone noinline } ; OPT-O2O3-DAG: Skipping pass 'SLP Vectorizer' ; Additional IR passes that opt doesn't turn on by default. -; OPT-MORE-DAG: Skipping pass 'Basic-Block Vectorization' ; OPT-MORE-DAG: Skipping pass 'Dead Code Elimination' ; OPT-MORE-DAG: Skipping pass 'Dead Instruction Elimination' ; OPT-MORE-DAG: Skipping pass 'Lower atomic intrinsics diff --git a/test/Transforms/BBVectorize/X86/cmp-types.ll b/test/Transforms/BBVectorize/X86/cmp-types.ll deleted file mode 100644 index fc1da1b0c60..00000000000 --- a/test/Transforms/BBVectorize/X86/cmp-types.ll +++ /dev/null @@ -1,16 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s - -%"struct.btSoftBody" = type { float, float, float*, i8 } - -define void @test1(%"struct.btSoftBody"* %n1, %"struct.btSoftBody"* %n2) uwtable align 2 { -entry: - %tobool15 = icmp ne %"struct.btSoftBody"* %n1, null - %cond16 = zext i1 %tobool15 to i32 - %tobool21 = icmp ne %"struct.btSoftBody"* %n2, null - %cond22 = zext i1 %tobool21 to i32 - ret void -; CHECK-LABEL: @test1( -} - diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll deleted file mode 100644 index a533713609a..00000000000 --- a/test/Transforms/BBVectorize/X86/loop1.ll +++ /dev/null @@ -1,61 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-partial-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL -; The second check covers the use of alias analysis (with loop unrolling). - -define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable { -entry: - br label %for.body -; CHECK-LABEL: @test1( -; CHECK-UNRL-LABEL: @test1( - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv - %0 = load double, double* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv - %1 = load double, double* %arrayidx2, align 8 - %mul = fmul double %0, %0 - %mul3 = fmul double %0, %1 - %add = fadd double %mul, %mul3 - %add4 = fadd double %1, %1 - %add5 = fadd double %add4, %0 - %mul6 = fmul double %0, %add5 - %add7 = fadd double %add, %mul6 - %mul8 = fmul double %1, %1 - %add9 = fadd double %0, %0 - %add10 = fadd double %add9, %0 - %mul11 = fmul double %mul8, %add10 - %add12 = fadd double %add7, %mul11 - %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv - store double %add12, double* %arrayidx14, align 8 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 10 - br i1 %exitcond, label %for.end, label %for.body -; CHECK: insertelement -; CHECK-NEXT: insertelement -; CHECK-NEXT: fadd <2 x double> -; CHECK-NEXT: insertelement -; CHECK-NEXT: shufflevector -; CHECK-NEXT: fadd <2 x double> -; CHECK-NEXT: insertelement -; CHECK-NEXT: fmul <2 x double> - -; CHECK-UNRL: %mul = fmul <2 x double> %2, %2 -; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3 -; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3 -; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3 -; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2 -; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5 -; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6 -; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3 -; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2 -; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2 -; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10 -; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11 - -for.end: ; preds = %for.body - ret void -} diff --git a/test/Transforms/BBVectorize/X86/pr15289.ll b/test/Transforms/BBVectorize/X86/pr15289.ll deleted file mode 100644 index a383a260faf..00000000000 --- a/test/Transforms/BBVectorize/X86/pr15289.ll +++ /dev/null @@ -1,95 +0,0 @@ -; RUN: opt < %s -basicaa -bb-vectorize -disable-output -; This is a bugpoint-reduced test case. It did not always assert, but does reproduce the bug -; and running under valgrind (or some similar tool) will catch the error. - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-apple-darwin12.2.0" - -%0 = type { [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }] } -%1 = type { [10 x [8 x i8]] } -%2 = type { i64, i64 } -%3 = type { [10 x i64], i64, i64, i64, i64, i64 } -%4 = type { i64, i64, i64, i64, i64, i64 } -%5 = type { [10 x i64] } -%6 = type { [10 x float], [10 x float], [10 x float], [10 x float] } -%struct.__st_parameter_dt.1.3.5.7 = type { %struct.__st_parameter_common.0.2.4.6, i64, i64*, i64*, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, [256 x i8], i32*, i64, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, [4 x i8] } -%struct.__st_parameter_common.0.2.4.6 = type { i32, i32, i8*, i32, i32, i8*, i32* } - -@cctenso_ = external unnamed_addr global %0, align 32 -@ctenso_ = external unnamed_addr global %1, align 32 -@i_dim_ = external unnamed_addr global %2, align 16 -@itenso1_ = external unnamed_addr global %3, align 32 -@itenso2_ = external unnamed_addr global %4, align 32 -@ltenso_ = external unnamed_addr global %5, align 32 -@rtenso_ = external unnamed_addr global %6, align 32 -@.cst = external unnamed_addr constant [8 x i8], align 8 -@.cst1 = external unnamed_addr constant [3 x i8], align 8 -@.cst2 = external unnamed_addr constant [29 x i8], align 8 -@.cst3 = external unnamed_addr constant [32 x i8], align 64 - -define void @cart_to_dc2y_(double* noalias nocapture %xx, double* noalias nocapture %yy, double* noalias nocapture %zz, [5 x { double, double }]* noalias nocapture %c2ten) nounwind uwtable { -entry: - %0 = fmul double undef, undef - %1 = fmul double undef, undef - %2 = fadd double undef, undef - %3 = fmul double undef, 0x3FE8B8B76E3E9919 - %4 = fsub double %0, %1 - %5 = fsub double -0.000000e+00, undef - %6 = fmul double undef, undef - %7 = fmul double %4, %6 - %8 = fmul double undef, 2.000000e+00 - %9 = fmul double %8, undef - %10 = fmul double undef, %9 - %11 = fmul double %10, undef - %12 = fsub double undef, %7 - %13 = fmul double %3, %12 - %14 = fmul double %3, undef - %15 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 0 - store double %13, double* %15, align 8 - %16 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 1 - %17 = fmul double undef, %8 - %18 = fmul double %17, undef - %19 = fmul double undef, %18 - %20 = fadd double undef, undef - %21 = fmul double %3, %19 - %22 = fsub double -0.000000e+00, %21 - %23 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 0 - store double %22, double* %23, align 8 - %24 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 1 - %25 = fmul double undef, 0x3FE42F601A8C6794 - %26 = fmul double undef, 2.000000e+00 - %27 = fsub double %26, %0 - %28 = fmul double %6, undef - %29 = fsub double undef, %28 - %30 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 0 - store double undef, double* %30, align 8 - %31 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 1 - %32 = fmul double undef, %17 - %33 = fmul double undef, %17 - %34 = fmul double undef, %32 - %35 = fmul double undef, %33 - %36 = fsub double undef, %35 - %37 = fmul double %3, %34 - %38 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 3, i32 0 - store double %37, double* %38, align 8 - %39 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 3, i32 1 - %40 = fmul double undef, %8 - %41 = fmul double undef, %40 - %42 = fmul double undef, %41 - %43 = fsub double undef, %42 - %44 = fmul double %3, %43 - %45 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 0 - store double %13, double* %45, align 8 - %46 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 1 - %47 = fsub double -0.000000e+00, %14 - store double %47, double* %16, align 8 - store double undef, double* %24, align 8 - store double -0.000000e+00, double* %31, align 8 - store double undef, double* %39, align 8 - store double undef, double* %46, align 8 - ret void -} - -attributes #0 = { nounwind uwtable } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind } diff --git a/test/Transforms/BBVectorize/X86/sh-rec.ll b/test/Transforms/BBVectorize/X86/sh-rec.ll deleted file mode 100644 index 2cb9dbded22..00000000000 --- a/test/Transforms/BBVectorize/X86/sh-rec.ll +++ /dev/null @@ -1,54 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s - -define void @ptoa() nounwind uwtable { -entry: - %call = call i8* @malloc() nounwind - br i1 undef, label %return, label %if.end10 - -if.end10: ; preds = %entry - %incdec.ptr = getelementptr inbounds i8, i8* %call, i64 undef - %call17 = call i32 @ptou() nounwind - %incdec.ptr26.1 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -2 - store i8 undef, i8* %incdec.ptr26.1, align 1 - %div27.1 = udiv i32 %call17, 100 - %rem.2 = urem i32 %div27.1, 10 - %add2230.2 = or i32 %rem.2, 48 - %conv25.2 = trunc i32 %add2230.2 to i8 - %incdec.ptr26.2 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -3 - store i8 %conv25.2, i8* %incdec.ptr26.2, align 1 - %incdec.ptr26.3 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -4 - store i8 undef, i8* %incdec.ptr26.3, align 1 - %div27.3 = udiv i32 %call17, 10000 - %rem.4 = urem i32 %div27.3, 10 - %add2230.4 = or i32 %rem.4, 48 - %conv25.4 = trunc i32 %add2230.4 to i8 - %incdec.ptr26.4 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -5 - store i8 %conv25.4, i8* %incdec.ptr26.4, align 1 - %div27.4 = udiv i32 %call17, 100000 - %rem.5 = urem i32 %div27.4, 10 - %add2230.5 = or i32 %rem.5, 48 - %conv25.5 = trunc i32 %add2230.5 to i8 - %incdec.ptr26.5 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -6 - store i8 %conv25.5, i8* %incdec.ptr26.5, align 1 - %incdec.ptr26.6 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -7 - store i8 0, i8* %incdec.ptr26.6, align 1 - %incdec.ptr26.7 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -8 - store i8 undef, i8* %incdec.ptr26.7, align 1 - %div27.7 = udiv i32 %call17, 100000000 - %rem.8 = urem i32 %div27.7, 10 - %add2230.8 = or i32 %rem.8, 48 - %conv25.8 = trunc i32 %add2230.8 to i8 - %incdec.ptr26.8 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -9 - store i8 %conv25.8, i8* %incdec.ptr26.8, align 1 - unreachable - -return: ; preds = %entry - ret void -; CHECK-LABEL: @ptoa( -} - -declare noalias i8* @malloc() nounwind - -declare i32 @ptou() diff --git a/test/Transforms/BBVectorize/X86/sh-rec2.ll b/test/Transforms/BBVectorize/X86/sh-rec2.ll deleted file mode 100644 index d7a004c2138..00000000000 --- a/test/Transforms/BBVectorize/X86/sh-rec2.ll +++ /dev/null @@ -1,85 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s - -%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 } - -define void @gsm_encode(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i8* %c) nounwind uwtable { -entry: - %xmc = alloca [52 x i16], align 16 - %arraydecay5 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 0 - call void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i16* undef, i16* null, i16* undef, i16* undef, i16* undef, i16* %arraydecay5) nounwind - %incdec.ptr136 = getelementptr inbounds i8, i8* %c, i64 10 - %incdec.ptr157 = getelementptr inbounds i8, i8* %c, i64 11 - store i8 0, i8* %incdec.ptr136, align 1 - %arrayidx162 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 11 - %0 = load i16, i16* %arrayidx162, align 2 - %conv1631 = trunc i16 %0 to i8 - %and164 = shl i8 %conv1631, 3 - %shl165 = and i8 %and164, 56 - %incdec.ptr172 = getelementptr inbounds i8, i8* %c, i64 12 - store i8 %shl165, i8* %incdec.ptr157, align 1 - %1 = load i16, i16* inttoptr (i64 2 to i16*), align 2 - %conv1742 = trunc i16 %1 to i8 - %and175 = shl i8 %conv1742, 1 - %incdec.ptr183 = getelementptr inbounds i8, i8* %c, i64 13 - store i8 %and175, i8* %incdec.ptr172, align 1 - %incdec.ptr199 = getelementptr inbounds i8, i8* %c, i64 14 - store i8 0, i8* %incdec.ptr183, align 1 - %arrayidx214 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 15 - %incdec.ptr220 = getelementptr inbounds i8, i8* %c, i64 15 - store i8 0, i8* %incdec.ptr199, align 1 - %2 = load i16, i16* %arrayidx214, align 2 - %conv2223 = trunc i16 %2 to i8 - %and223 = shl i8 %conv2223, 6 - %incdec.ptr235 = getelementptr inbounds i8, i8* %c, i64 16 - store i8 %and223, i8* %incdec.ptr220, align 1 - %arrayidx240 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 19 - %3 = load i16, i16* %arrayidx240, align 2 - %conv2414 = trunc i16 %3 to i8 - %and242 = shl i8 %conv2414, 2 - %shl243 = and i8 %and242, 28 - %incdec.ptr251 = getelementptr inbounds i8, i8* %c, i64 17 - store i8 %shl243, i8* %incdec.ptr235, align 1 - %incdec.ptr272 = getelementptr inbounds i8, i8* %c, i64 18 - store i8 0, i8* %incdec.ptr251, align 1 - %arrayidx282 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 25 - %4 = load i16, i16* %arrayidx282, align 2 - %conv2835 = trunc i16 %4 to i8 - %and284 = and i8 %conv2835, 7 - %incdec.ptr287 = getelementptr inbounds i8, i8* %c, i64 19 - store i8 %and284, i8* %incdec.ptr272, align 1 - %incdec.ptr298 = getelementptr inbounds i8, i8* %c, i64 20 - store i8 0, i8* %incdec.ptr287, align 1 - %incdec.ptr314 = getelementptr inbounds i8, i8* %c, i64 21 - store i8 0, i8* %incdec.ptr298, align 1 - %arrayidx319 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 26 - %5 = load i16, i16* %arrayidx319, align 4 - %conv3206 = trunc i16 %5 to i8 - %and321 = shl i8 %conv3206, 4 - %shl322 = and i8 %and321, 112 - %incdec.ptr335 = getelementptr inbounds i8, i8* %c, i64 22 - store i8 %shl322, i8* %incdec.ptr314, align 1 - %arrayidx340 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 29 - %6 = load i16, i16* %arrayidx340, align 2 - %conv3417 = trunc i16 %6 to i8 - %and342 = shl i8 %conv3417, 3 - %shl343 = and i8 %and342, 56 - %incdec.ptr350 = getelementptr inbounds i8, i8* %c, i64 23 - store i8 %shl343, i8* %incdec.ptr335, align 1 - %incdec.ptr366 = getelementptr inbounds i8, i8* %c, i64 24 - store i8 0, i8* %incdec.ptr350, align 1 - %arrayidx381 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 36 - %incdec.ptr387 = getelementptr inbounds i8, i8* %c, i64 25 - store i8 0, i8* %incdec.ptr366, align 1 - %7 = load i16, i16* %arrayidx381, align 8 - %conv3898 = trunc i16 %7 to i8 - %and390 = shl i8 %conv3898, 6 - store i8 %and390, i8* %incdec.ptr387, align 1 - unreachable -; CHECK-LABEL: @gsm_encode( -} - -declare void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352*, i16*, i16*, i16*, i16*, i16*, i16*, i16*) - -declare void @llvm.trap() noreturn nounwind diff --git a/test/Transforms/BBVectorize/X86/sh-rec3.ll b/test/Transforms/BBVectorize/X86/sh-rec3.ll deleted file mode 100644 index 2096deb08a9..00000000000 --- a/test/Transforms/BBVectorize/X86/sh-rec3.ll +++ /dev/null @@ -1,170 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s - -%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 } - -define void @gsm_encode(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i8* %c) nounwind uwtable { -entry: - %LARc28 = alloca [2 x i64], align 16 - %LARc28.sub = getelementptr inbounds [2 x i64], [2 x i64]* %LARc28, i64 0, i64 0 - %tmpcast = bitcast [2 x i64]* %LARc28 to [8 x i16]* - %Nc = alloca [4 x i16], align 2 - %Mc = alloca [4 x i16], align 2 - %bc = alloca [4 x i16], align 2 - %xmc = alloca [52 x i16], align 16 - %arraydecay = bitcast [2 x i64]* %LARc28 to i16* - %arraydecay1 = getelementptr inbounds [4 x i16], [4 x i16]* %Nc, i64 0, i64 0 - %arraydecay2 = getelementptr inbounds [4 x i16], [4 x i16]* %bc, i64 0, i64 0 - %arraydecay3 = getelementptr inbounds [4 x i16], [4 x i16]* %Mc, i64 0, i64 0 - %arraydecay5 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 0 - call void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i16* %arraydecay, i16* %arraydecay1, i16* %arraydecay2, i16* %arraydecay3, i16* undef, i16* %arraydecay5) nounwind - %0 = load i64, i64* %LARc28.sub, align 16 - %1 = trunc i64 %0 to i32 - %conv1 = lshr i32 %1, 2 - %and = and i32 %conv1, 15 - %or = or i32 %and, 208 - %conv6 = trunc i32 %or to i8 - %incdec.ptr = getelementptr inbounds i8, i8* %c, i64 1 - store i8 %conv6, i8* %c, align 1 - %conv84 = trunc i64 %0 to i8 - %and9 = shl i8 %conv84, 6 - %incdec.ptr15 = getelementptr inbounds i8, i8* %c, i64 2 - store i8 %and9, i8* %incdec.ptr, align 1 - %2 = lshr i64 %0, 50 - %shr226.tr = trunc i64 %2 to i8 - %conv25 = and i8 %shr226.tr, 7 - %incdec.ptr26 = getelementptr inbounds i8, i8* %c, i64 3 - store i8 %conv25, i8* %incdec.ptr15, align 1 - %incdec.ptr42 = getelementptr inbounds i8, i8* %c, i64 4 - store i8 0, i8* %incdec.ptr26, align 1 - %arrayidx52 = getelementptr inbounds [8 x i16], [8 x i16]* %tmpcast, i64 0, i64 7 - %3 = load i16, i16* %arrayidx52, align 2 - %conv537 = trunc i16 %3 to i8 - %and54 = and i8 %conv537, 7 - %incdec.ptr57 = getelementptr inbounds i8, i8* %c, i64 5 - store i8 %and54, i8* %incdec.ptr42, align 1 - %incdec.ptr68 = getelementptr inbounds i8, i8* %c, i64 6 - store i8 0, i8* %incdec.ptr57, align 1 - %4 = load i16, i16* %arraydecay3, align 2 - %conv748 = trunc i16 %4 to i8 - %and75 = shl i8 %conv748, 5 - %shl76 = and i8 %and75, 96 - %incdec.ptr84 = getelementptr inbounds i8, i8* %c, i64 7 - store i8 %shl76, i8* %incdec.ptr68, align 1 - %arrayidx94 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 1 - %5 = load i16, i16* %arrayidx94, align 2 - %conv959 = trunc i16 %5 to i8 - %and96 = shl i8 %conv959, 1 - %shl97 = and i8 %and96, 14 - %or103 = or i8 %shl97, 1 - %incdec.ptr105 = getelementptr inbounds i8, i8* %c, i64 8 - store i8 %or103, i8* %incdec.ptr84, align 1 - %arrayidx115 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 4 - %6 = bitcast i16* %arrayidx115 to i32* - %7 = load i32, i32* %6, align 8 - %conv11610 = trunc i32 %7 to i8 - %and117 = and i8 %conv11610, 7 - %incdec.ptr120 = getelementptr inbounds i8, i8* %c, i64 9 - store i8 %and117, i8* %incdec.ptr105, align 1 - %8 = lshr i32 %7, 16 - %and12330 = shl nuw nsw i32 %8, 5 - %and123 = trunc i32 %and12330 to i8 - %incdec.ptr136 = getelementptr inbounds i8, i8* %c, i64 10 - store i8 %and123, i8* %incdec.ptr120, align 1 - %incdec.ptr157 = getelementptr inbounds i8, i8* %c, i64 11 - store i8 0, i8* %incdec.ptr136, align 1 - %incdec.ptr172 = getelementptr inbounds i8, i8* %c, i64 12 - store i8 0, i8* %incdec.ptr157, align 1 - %arrayidx173 = getelementptr inbounds [4 x i16], [4 x i16]* %Nc, i64 0, i64 1 - %9 = load i16, i16* %arrayidx173, align 2 - %conv17412 = zext i16 %9 to i32 - %and175 = shl nuw nsw i32 %conv17412, 1 - %arrayidx177 = getelementptr inbounds [4 x i16], [4 x i16]* %bc, i64 0, i64 1 - %10 = load i16, i16* %arrayidx177, align 2 - %conv17826 = zext i16 %10 to i32 - %shr17913 = lshr i32 %conv17826, 1 - %and180 = and i32 %shr17913, 1 - %or181 = or i32 %and175, %and180 - %conv182 = trunc i32 %or181 to i8 - %incdec.ptr183 = getelementptr inbounds i8, i8* %c, i64 13 - store i8 %conv182, i8* %incdec.ptr172, align 1 - %arrayidx188 = getelementptr inbounds [4 x i16], [4 x i16]* %Mc, i64 0, i64 1 - %11 = load i16, i16* %arrayidx188, align 2 - %conv18914 = trunc i16 %11 to i8 - %and190 = shl i8 %conv18914, 5 - %shl191 = and i8 %and190, 96 - %incdec.ptr199 = getelementptr inbounds i8, i8* %c, i64 14 - store i8 %shl191, i8* %incdec.ptr183, align 1 - %arrayidx209 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 14 - %12 = load i16, i16* %arrayidx209, align 4 - %conv21015 = trunc i16 %12 to i8 - %and211 = shl i8 %conv21015, 1 - %shl212 = and i8 %and211, 14 - %or218 = or i8 %shl212, 1 - %incdec.ptr220 = getelementptr inbounds i8, i8* %c, i64 15 - store i8 %or218, i8* %incdec.ptr199, align 1 - %arrayidx225 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 16 - %13 = bitcast i16* %arrayidx225 to i64* - %14 = load i64, i64* %13, align 16 - %conv22616 = trunc i64 %14 to i8 - %and227 = shl i8 %conv22616, 3 - %shl228 = and i8 %and227, 56 - %incdec.ptr235 = getelementptr inbounds i8, i8* %c, i64 16 - store i8 %shl228, i8* %incdec.ptr220, align 1 - %15 = lshr i64 %14, 32 - %and23832 = shl nuw nsw i64 %15, 5 - %and238 = trunc i64 %and23832 to i8 - %incdec.ptr251 = getelementptr inbounds i8, i8* %c, i64 17 - store i8 %and238, i8* %incdec.ptr235, align 1 - %arrayidx266 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 23 - %incdec.ptr272 = getelementptr inbounds i8, i8* %c, i64 18 - store i8 0, i8* %incdec.ptr251, align 1 - %16 = load i16, i16* %arrayidx266, align 2 - %conv27418 = trunc i16 %16 to i8 - %and275 = shl i8 %conv27418, 6 - %incdec.ptr287 = getelementptr inbounds i8, i8* %c, i64 19 - store i8 %and275, i8* %incdec.ptr272, align 1 - %arrayidx288 = getelementptr inbounds [4 x i16], [4 x i16]* %Nc, i64 0, i64 2 - %17 = load i16, i16* %arrayidx288, align 2 - %conv28919 = zext i16 %17 to i32 - %and290 = shl nuw nsw i32 %conv28919, 1 - %arrayidx292 = getelementptr inbounds [4 x i16], [4 x i16]* %bc, i64 0, i64 2 - %18 = load i16, i16* %arrayidx292, align 2 - %conv29327 = zext i16 %18 to i32 - %shr29420 = lshr i32 %conv29327, 1 - %and295 = and i32 %shr29420, 1 - %or296 = or i32 %and290, %and295 - %conv297 = trunc i32 %or296 to i8 - %incdec.ptr298 = getelementptr inbounds i8, i8* %c, i64 20 - store i8 %conv297, i8* %incdec.ptr287, align 1 - %conv30021 = trunc i16 %18 to i8 - %and301 = shl i8 %conv30021, 7 - %incdec.ptr314 = getelementptr inbounds i8, i8* %c, i64 21 - store i8 %and301, i8* %incdec.ptr298, align 1 - %incdec.ptr335 = getelementptr inbounds i8, i8* %c, i64 22 - store i8 0, i8* %incdec.ptr314, align 1 - %arrayidx340 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 29 - %19 = load i16, i16* %arrayidx340, align 2 - %conv34122 = trunc i16 %19 to i8 - %and342 = shl i8 %conv34122, 3 - %shl343 = and i8 %and342, 56 - %incdec.ptr350 = getelementptr inbounds i8, i8* %c, i64 23 - store i8 %shl343, i8* %incdec.ptr335, align 1 - %arrayidx355 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 32 - %20 = bitcast i16* %arrayidx355 to i32* - %21 = load i32, i32* %20, align 16 - %conv35623 = shl i32 %21, 2 - %shl358 = and i32 %conv35623, 28 - %22 = lshr i32 %21, 17 - %and363 = and i32 %22, 3 - %or364 = or i32 %shl358, %and363 - %conv365 = trunc i32 %or364 to i8 - store i8 %conv365, i8* %incdec.ptr350, align 1 - unreachable -; CHECK-LABEL: @gsm_encode( -} - -declare void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565*, i16*, i16*, i16*, i16*, i16*, i16*, i16*) - -declare void @llvm.trap() noreturn nounwind diff --git a/test/Transforms/BBVectorize/X86/sh-types.ll b/test/Transforms/BBVectorize/X86/sh-types.ll deleted file mode 100644 index fbff2fb86eb..00000000000 --- a/test/Transforms/BBVectorize/X86/sh-types.ll +++ /dev/null @@ -1,25 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s - -define <4 x float> @test7(<4 x float> %A1, <4 x float> %B1, double %C1, double %C2, double %D1, double %D2) { - %A2 = shufflevector <4 x float> %A1, <4 x float> undef, <4 x i32> - %B2 = shufflevector <4 x float> %B1, <4 x float> undef, <4 x i32> - %X1 = shufflevector <4 x float> %A2, <4 x float> undef, <2 x i32> - %X2 = shufflevector <4 x float> %B2, <4 x float> undef, <2 x i32> - %Y1 = shufflevector <2 x float> %X1, <2 x float> undef, <4 x i32> - %Y2 = shufflevector <2 x float> %X2, <2 x float> undef, <4 x i32> - - %M1 = fsub double %C1, %D1 - %M2 = fsub double %C2, %D2 - %N1 = fmul double %M1, %C1 - %N2 = fmul double %M2, %C2 - %Z1 = fadd double %N1, %D1 - %Z2 = fadd double %N2, %D2 - - %R = fmul <4 x float> %Y1, %Y2 - ret <4 x float> %R -; CHECK-LABEL: @test7( -; CHECK-NOT: <8 x float> -; CHECK: ret <4 x float> -} - diff --git a/test/Transforms/BBVectorize/X86/simple-int.ll b/test/Transforms/BBVectorize/X86/simple-int.ll deleted file mode 100644 index ee5b5b3e4d0..00000000000 --- a/test/Transforms/BBVectorize/X86/simple-int.ll +++ /dev/null @@ -1,127 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" - -declare double @llvm.fma.f64(double, double, double) -declare double @llvm.fmuladd.f64(double, double, double) -declare double @llvm.cos.f64(double) -declare double @llvm.powi.f64(double, i32) - -; Basic depth-3 chain with fma -define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[Y1:%.*]] = call double @llvm.fma.f64(double [[X1]], double [[A1]], double [[C1:%.*]]) -; CHECK-NEXT: [[Y2:%.*]] = call double @llvm.fma.f64(double [[X2]], double [[A2]], double [[C2:%.*]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd double [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = fadd double [[Y2]], [[B2]] -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1]], [[Z2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1) - %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with fmuladd -define double @test1a(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { -; CHECK-LABEL: @test1a( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1_V_I2_1:%.*]] = insertelement <2 x double> undef, double [[C1:%.*]], i32 0 -; CHECK-NEXT: [[Y1_V_I2_2:%.*]] = insertelement <2 x double> [[Y1_V_I2_1]], double [[C2:%.*]], i32 1 -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[X1]], <2 x double> [[X1_V_I0_2]], <2 x double> [[Y1_V_I2_2]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.fmuladd.f64(double %X1, double %A1, double %C1) - %Y2 = call double @llvm.fmuladd.f64(double %X2, double %A2, double %C2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with cos -define double @test2(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[Y1:%.*]] = call double @llvm.cos.f64(double [[X1]]) -; CHECK-NEXT: [[Y2:%.*]] = call double @llvm.cos.f64(double [[X2]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd double [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = fadd double [[Y2]], [[B2]] -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1]], [[Z2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.cos.f64(double %X1) - %Y2 = call double @llvm.cos.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with powi -define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[Y1:%.*]] = call double @llvm.powi.f64(double [[X1]], i32 [[P:%.*]]) -; CHECK-NEXT: [[Y2:%.*]] = call double @llvm.powi.f64(double [[X2]], i32 [[P]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd double [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = fadd double [[Y2]], [[B2]] -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1]], [[Z2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.powi.f64(double %X1, i32 %P) - %Y2 = call double @llvm.powi.f64(double %X2, i32 %P) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with powi (different powers: should not vectorize) -define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[P2:%.*]] = add i32 [[P:%.*]], 1 -; CHECK-NEXT: [[Y1:%.*]] = call double @llvm.powi.f64(double [[X1]], i32 [[P]]) -; CHECK-NEXT: [[Y2:%.*]] = call double @llvm.powi.f64(double [[X2]], i32 [[P2]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd double [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = fadd double [[Y2]], [[B2]] -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1]], [[Z2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %P2 = add i32 %P, 1 - %Y1 = call double @llvm.powi.f64(double %X1, i32 %P) - %Y2 = call double @llvm.powi.f64(double %X2, i32 %P2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - diff --git a/test/Transforms/BBVectorize/X86/simple-ldstr.ll b/test/Transforms/BBVectorize/X86/simple-ldstr.ll deleted file mode 100644 index a81d9638f5e..00000000000 --- a/test/Transforms/BBVectorize/X86/simple-ldstr.ll +++ /dev/null @@ -1,33 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" - -; Simple 3-pair chain with loads and stores -define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[I0_V_I0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: [[I1_V_I0:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* -; CHECK-NEXT: [[I0:%.*]] = load <2 x double>, <2 x double>* [[I0_V_I0]], align 8 -; CHECK-NEXT: [[I1:%.*]] = load <2 x double>, <2 x double>* [[I1_V_I0]], align 8 -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[I0]], [[I1]] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[C:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[MUL]], <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: ret void -; -entry: - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - ret void -} - diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll deleted file mode 100644 index 0f7ddffbd19..00000000000 --- a/test/Transforms/BBVectorize/X86/simple.ll +++ /dev/null @@ -1,149 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" - -; Basic depth-3 chain -define double @test1(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic chain -define double @test1a(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test1a( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[W1:%.*]] = fadd <2 x double> [[Y1]], [[Z1]] -; CHECK-NEXT: [[V1:%.*]] = fadd <2 x double> [[W1]], [[Z1]] -; CHECK-NEXT: [[Q1:%.*]] = fadd <2 x double> [[W1]], [[V1]] -; CHECK-NEXT: [[S1:%.*]] = fadd <2 x double> [[W1]], [[Q1]] -; CHECK-NEXT: [[S1_V_R1:%.*]] = extractelement <2 x double> [[S1]], i32 0 -; CHECK-NEXT: [[S1_V_R2:%.*]] = extractelement <2 x double> [[S1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[S1_V_R1]], [[S1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %W1 = fadd double %Y1, %Z1 - %W2 = fadd double %Y2, %Z2 - %V1 = fadd double %W1, %Z1 - %V2 = fadd double %W2, %Z2 - %Q1 = fadd double %W1, %V1 - %Q2 = fadd double %W2, %V2 - %S1 = fadd double %W1, %Q1 - %S2 = fadd double %W2, %Q2 - %R = fmul double %S1, %S2 - ret double %R -} - -; Basic depth-3 chain (last pair permuted) -define double @test2(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0 -; CHECK-NEXT: [[Z1_V_I1_2:%.*]] = insertelement <2 x double> [[Z1_V_I1_1]], double [[B1]], i32 1 -; CHECK-NEXT: [[Z2:%.*]] = fadd <2 x double> [[Y1]], [[Z1_V_I1_2]] -; CHECK-NEXT: [[Z2_V_R1:%.*]] = extractelement <2 x double> [[Z2]], i32 0 -; CHECK-NEXT: [[Z2_V_R2:%.*]] = extractelement <2 x double> [[Z2]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z2_V_R2]], [[Z2_V_R1]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y2, %B1 - %Z2 = fadd double %Y1, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-4 chain (internal permutation) -define double @test4(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0 -; CHECK-NEXT: [[Z1_V_I1_2:%.*]] = insertelement <2 x double> [[Z1_V_I1_1]], double [[B1]], i32 1 -; CHECK-NEXT: [[Z2:%.*]] = fadd <2 x double> [[Y1]], [[Z1_V_I1_2]] -; CHECK-NEXT: [[Z2_V_R1:%.*]] = extractelement <2 x double> [[Z2]], i32 0 -; CHECK-NEXT: [[Z2_V_R2:%.*]] = extractelement <2 x double> [[Z2]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z2_V_R2]], [[Z2_V_R1]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y2, %B1 - %Z2 = fadd double %Y1, %B2 - %W1 = fadd double %Y2, %Z1 - %W2 = fadd double %Y1, %Z2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic chain with shuffles -define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { -; CHECK-LABEL: @test6( -; CHECK-NEXT: [[X1:%.*]] = sub <8 x i8> [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = sub <8 x i8> [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[Y1:%.*]] = mul <8 x i8> [[X1]], [[A1]] -; CHECK-NEXT: [[Y2:%.*]] = mul <8 x i8> [[X2]], [[A2]] -; CHECK-NEXT: [[Z1:%.*]] = add <8 x i8> [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = add <8 x i8> [[Y2]], [[B2]] -; CHECK-NEXT: [[Q1:%.*]] = shufflevector <8 x i8> [[Z1]], <8 x i8> [[Z2]], <8 x i32> -; CHECK-NEXT: [[Q2:%.*]] = shufflevector <8 x i8> [[Z2]], <8 x i8> undef, <8 x i32> -; CHECK-NEXT: [[R:%.*]] = mul <8 x i8> [[Q1]], [[Q2]] -; CHECK-NEXT: ret <8 x i8> [[R]] -; - %X1 = sub <8 x i8> %A1, %B1 - %X2 = sub <8 x i8> %A2, %B2 - %Y1 = mul <8 x i8> %X1, %A1 - %Y2 = mul <8 x i8> %X2, %A2 - %Z1 = add <8 x i8> %Y1, %B1 - %Z2 = add <8 x i8> %Y2, %B2 - %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> - %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> - %R = mul <8 x i8> %Q1, %Q2 - ret <8 x i8> %R -} - diff --git a/test/Transforms/BBVectorize/X86/vs-cast.ll b/test/Transforms/BBVectorize/X86/vs-cast.ll deleted file mode 100644 index 297f2d5a7b3..00000000000 --- a/test/Transforms/BBVectorize/X86/vs-cast.ll +++ /dev/null @@ -1,19 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @main() nounwind uwtable { -; CHECK-LABEL: @main( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> undef to i128 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> undef to i128 -; CHECK-NEXT: ret void -; -entry: - %0 = bitcast <2 x i64> undef to i128 - %1 = bitcast <2 x i64> undef to i128 - ret void -} - diff --git a/test/Transforms/BBVectorize/X86/wr-aliases.ll b/test/Transforms/BBVectorize/X86/wr-aliases.ll deleted file mode 100644 index e34414988f3..00000000000 --- a/test/Transforms/BBVectorize/X86/wr-aliases.ll +++ /dev/null @@ -1,144 +0,0 @@ -; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -disable-basicaa -bb-vectorize -S < %s | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%class.QBezier.15 = type { double, double, double, double, double, double, double, double } - -; Function Attrs: nounwind -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0 - -; Function Attrs: uwtable -declare fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval nocapture readonly align 8) #1 - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #0 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 - -define void @main_arrayctor.cont([10 x %class.QBezier.15]* %beziers, %class.QBezier.15* %agg.tmp.i, %class.QBezier.15* %agg.tmp55.i, %class.QBezier.15* %agg.tmp56.i) { -newFuncRoot: - br label %arrayctor.cont - -arrayctor.cont.ret.exitStub: ; preds = %arrayctor.cont - ret void - -; CHECK-LABEL: @main_arrayctor.cont -; CHECK: <2 x double> -; CHECK: @_ZL12printQBezier7QBezier -; CHECK: store double %mul8.i, double* %x3.i, align 16 -; CHECK: load double, double* %x3.i, align 16 -; CHECK: ret - -arrayctor.cont: ; preds = %newFuncRoot - %ref.tmp.sroa.0.0.idx = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 0 - store double 1.000000e+01, double* %ref.tmp.sroa.0.0.idx, align 16 - %ref.tmp.sroa.2.0.idx1 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 1 - store double 2.000000e+01, double* %ref.tmp.sroa.2.0.idx1, align 8 - %ref.tmp.sroa.3.0.idx2 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 2 - store double 3.000000e+01, double* %ref.tmp.sroa.3.0.idx2, align 16 - %ref.tmp.sroa.4.0.idx3 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 3 - store double 4.000000e+01, double* %ref.tmp.sroa.4.0.idx3, align 8 - %ref.tmp.sroa.5.0.idx4 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 4 - store double 5.000000e+01, double* %ref.tmp.sroa.5.0.idx4, align 16 - %ref.tmp.sroa.6.0.idx5 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 5 - store double 6.000000e+01, double* %ref.tmp.sroa.6.0.idx5, align 8 - %ref.tmp.sroa.7.0.idx6 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 6 - store double 7.000000e+01, double* %ref.tmp.sroa.7.0.idx6, align 16 - %ref.tmp.sroa.8.0.idx7 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 7 - store double 8.000000e+01, double* %ref.tmp.sroa.8.0.idx7, align 8 - %add.ptr = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1 - %v0 = bitcast %class.QBezier.15* %agg.tmp.i to i8* - call void @llvm.lifetime.start(i64 64, i8* %v0) - %v1 = bitcast %class.QBezier.15* %agg.tmp55.i to i8* - call void @llvm.lifetime.start(i64 64, i8* %v1) - %v2 = bitcast %class.QBezier.15* %agg.tmp56.i to i8* - call void @llvm.lifetime.start(i64 64, i8* %v2) - %v3 = bitcast [10 x %class.QBezier.15]* %beziers to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v0, i8* %v3, i64 64, i32 8, i1 false) - call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp.i) - %x2.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 2 - %v4 = load double, double* %x2.i, align 16 - %x3.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 4 - %v5 = load double, double* %x3.i, align 16 - %add.i = fadd double %v4, %v5 - %mul.i = fmul double 5.000000e-01, %add.i - %x1.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 0 - %v6 = load double, double* %x1.i, align 16 - %add3.i = fadd double %v4, %v6 - %mul4.i = fmul double 5.000000e-01, %add3.i - %x25.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 2 - store double %mul4.i, double* %x25.i, align 16 - %v7 = load double, double* %x3.i, align 16 - %x4.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 6 - %v8 = load double, double* %x4.i, align 16 - %add7.i = fadd double %v7, %v8 - %mul8.i = fmul double 5.000000e-01, %add7.i - store double %mul8.i, double* %x3.i, align 16 - %v9 = load double, double* %x1.i, align 16 - %x111.i = getelementptr inbounds %class.QBezier.15, %class.QBezier.15* %add.ptr, i64 0, i32 0 - store double %v9, double* %x111.i, align 16 - %v10 = load double, double* %x25.i, align 16 - %add15.i = fadd double %mul.i, %v10 - %mul16.i = fmul double 5.000000e-01, %add15.i - %x317.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 4 - store double %mul16.i, double* %x317.i, align 16 - %v11 = load double, double* %x3.i, align 16 - %add19.i = fadd double %mul.i, %v11 - %mul20.i = fmul double 5.000000e-01, %add19.i - store double %mul20.i, double* %x2.i, align 16 - %v12 = load double, double* %x317.i, align 16 - %add24.i = fadd double %v12, %mul20.i - %mul25.i = fmul double 5.000000e-01, %add24.i - store double %mul25.i, double* %x1.i, align 16 - %x427.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 6 - store double %mul25.i, double* %x427.i, align 16 - %y2.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 3 - %v13 = load double, double* %y2.i, align 8 - %y3.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 5 - %v14 = load double, double* %y3.i, align 8 - %add28.i = fadd double %v13, %v14 - %div.i = fmul double 5.000000e-01, %add28.i - %y1.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 1 - %v15 = load double, double* %y1.i, align 8 - %add30.i = fadd double %v13, %v15 - %mul31.i = fmul double 5.000000e-01, %add30.i - %y232.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 3 - store double %mul31.i, double* %y232.i, align 8 - %v16 = load double, double* %y3.i, align 8 - %y4.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 7 - %v17 = load double, double* %y4.i, align 8 - %add34.i = fadd double %v16, %v17 - %mul35.i = fmul double 5.000000e-01, %add34.i - store double %mul35.i, double* %y3.i, align 8 - %v18 = load double, double* %y1.i, align 8 - %y138.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 1 - store double %v18, double* %y138.i, align 8 - %v19 = load double, double* %y232.i, align 8 - %add42.i = fadd double %div.i, %v19 - %mul43.i = fmul double 5.000000e-01, %add42.i - %y344.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 5 - store double %mul43.i, double* %y344.i, align 8 - %v20 = load double, double* %y3.i, align 8 - %add46.i = fadd double %div.i, %v20 - %mul47.i = fmul double 5.000000e-01, %add46.i - store double %mul47.i, double* %y2.i, align 8 - %v21 = load double, double* %y344.i, align 8 - %add51.i = fadd double %v21, %mul47.i - %mul52.i = fmul double 5.000000e-01, %add51.i - store double %mul52.i, double* %y1.i, align 8 - %y454.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 7 - store double %mul52.i, double* %y454.i, align 8 - %v22 = bitcast %class.QBezier.15* %add.ptr to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v1, i8* %v22, i64 64, i32 8, i1 false) - call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp55.i) - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v2, i8* %v3, i64 64, i32 8, i1 false) - call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp56.i) - call void @llvm.lifetime.end.p0i8(i64 64, i8* %v0) - call void @llvm.lifetime.end.p0i8(i64 64, i8* %v1) - call void @llvm.lifetime.end.p0i8(i64 64, i8* %v2) - br label %arrayctor.cont.ret.exitStub -} - -attributes #0 = { nounwind } -attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll deleted file mode 100644 index 6bfa625ea5f..00000000000 --- a/test/Transforms/BBVectorize/cycle.ll +++ /dev/null @@ -1,112 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s - -; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise -; want to select the pairs: -; %div77 = fdiv double %sub74, %mul76.v.r1 <-> %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117) -; %add84 = fadd double %sub83, 2.000000e+00 <-> %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77) -; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <-> %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84) -; %mul117 = fmul double %sub39.v.r1, %sub116 <-> %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95) -; and so a dependency cycle would be created. - -declare double @fabs(double) nounwind readnone -define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) { -entry: - br label %go -go: - %conv = sitofp i32 %n.0 to double - %add35 = fadd double %conv, %a - %sub36 = fadd double %add35, -1.000000e+00 - %add38 = fadd double %conv, %b - %sub39 = fadd double %add38, -1.000000e+00 - %add41 = fadd double %conv, %c - %sub42 = fadd double %add41, -1.000000e+00 - %sub45 = fadd double %add35, -2.000000e+00 - %sub48 = fadd double %add38, -2.000000e+00 - %sub51 = fadd double %add41, -2.000000e+00 - %mul52 = shl nsw i32 %n.0, 1 - %sub53 = add nsw i32 %mul52, -1 - %conv54 = sitofp i32 %sub53 to double - %sub56 = add nsw i32 %mul52, -3 - %conv57 = sitofp i32 %sub56 to double - %sub59 = add nsw i32 %mul52, -5 - %conv60 = sitofp i32 %sub59 to double - %mul61 = mul nsw i32 %n.0, %n.0 - %conv62 = sitofp i32 %mul61 to double - %mul63 = fmul double %conv62, 3.000000e+00 - %mul67 = fmul double %sub65, %conv - %add68 = fadd double %mul63, %mul67 - %add69 = fadd double %add68, 2.000000e+00 - %sub71 = fsub double %add69, %mul2.v.r1 - %sub74 = fsub double %sub71, %mul73 - %mul75 = fmul double %conv57, 2.000000e+00 - %mul76 = fmul double %mul75, %sub42 - %div77 = fdiv double %sub74, %mul76 - %mul82 = fmul double %add80, %conv - %sub83 = fsub double %mul63, %mul82 - %add84 = fadd double %sub83, 2.000000e+00 - %sub86 = fsub double %add84, %mul2.v.r1 - %sub87 = fsub double -0.000000e+00, %sub86 - %mul88 = fmul double %sub36, %sub87 - %mul89 = fmul double %mul88, %sub39 - %mul90 = fmul double %conv54, 4.000000e+00 - %mul91 = fmul double %mul90, %conv57 - %mul92 = fmul double %mul91, %sub51 - %mul93 = fmul double %mul92, %sub42 - %div94 = fdiv double %mul89, %mul93 - %mul95 = fmul double %sub45, %sub36 - %mul96 = fmul double %mul95, %sub48 - %mul97 = fmul double %mul96, %sub39 - %sub99 = fsub double %conv, %a - %sub100 = fadd double %sub99, -2.000000e+00 - %mul101 = fmul double %mul97, %sub100 - %sub103 = fsub double %conv, %b - %sub104 = fadd double %sub103, -2.000000e+00 - %mul105 = fmul double %mul101, %sub104 - %mul106 = fmul double %conv57, 8.000000e+00 - %mul107 = fmul double %mul106, %conv57 - %mul108 = fmul double %mul107, %conv60 - %sub111 = fadd double %add41, -3.000000e+00 - %mul112 = fmul double %mul108, %sub111 - %mul113 = fmul double %mul112, %sub51 - %mul114 = fmul double %mul113, %sub42 - %div115 = fdiv double %mul105, %mul114 - %sub116 = fsub double -0.000000e+00, %sub36 - %mul117 = fmul double %sub39, %sub116 - %sub119 = fsub double %conv, %c - %sub120 = fadd double %sub119, -1.000000e+00 - %mul121 = fmul double %mul117, %sub120 - %mul123 = fmul double %mul75, %sub51 - %mul124 = fmul double %mul123, %sub42 - %div125 = fdiv double %mul121, %mul124 - %mul126 = fmul double %div77, %sub - %add127 = fadd double %mul126, 1.000000e+00 - %mul128 = fmul double %add127, %Anm1.0 - %mul129 = fmul double %div94, %sub - %add130 = fadd double %div125, %mul129 - %mul131 = fmul double %add130, %sub - %mul132 = fmul double %mul131, %Anm2.0 - %add133 = fadd double %mul128, %mul132 - %mul134 = fmul double %div115, %mul1 - %mul135 = fmul double %mul134, %Anm3.0 - %add136 = fadd double %add133, %mul135 - %mul139 = fmul double %add127, %Bnm1.0 - %mul143 = fmul double %mul131, %Bnm2.0 - %add144 = fadd double %mul139, %mul143 - %mul146 = fmul double %mul134, %Bnm3.0 - %add147 = fadd double %add144, %mul146 - %div148 = fdiv double %add136, %add147 - %sub149 = fsub double %F.0, %div148 - %div150 = fdiv double %sub149, %F.0 - %call = tail call double @fabs(double %div150) nounwind readnone - %cmp = fcmp olt double %call, 0x3CB0000000000000 - %cmp152 = icmp sgt i32 %n.0, 20000 - %or.cond = or i1 %cmp, %cmp152 - br i1 %or.cond, label %done, label %go -done: - ret void -; CHECK-LABEL: @test1( -; CHECK: go: -; CHECK: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0 -; FIXME: When tree pruning is deterministic, include the entire output. -} diff --git a/test/Transforms/BBVectorize/func-alias.ll b/test/Transforms/BBVectorize/func-alias.ll deleted file mode 100644 index ab72ec0e199..00000000000 --- a/test/Transforms/BBVectorize/func-alias.ll +++ /dev/null @@ -1,244 +0,0 @@ -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -basicaa -bb-vectorize -bb-vectorize-req-chain-depth=2 -instcombine -gvn -S | FileCheck %s -; The chain length is set to 2 so that this will do some vectorization; check that the order of the functions is unchanged. - -%struct.descriptor_dimension = type { i64, i64, i64 } -%struct.__st_parameter_common = type { i32, i32, i8*, i32, i32, i8*, i32* } -%struct.__st_parameter_dt = type { %struct.__st_parameter_common, i64, i64*, i64*, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, [256 x i8], i32*, i64, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, [4 x i8] } -%"struct.array4_real(kind=4)" = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] } -%"struct.array4_integer(kind=4).73" = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] } -%struct.array4_unknown = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] } - -@.cst4 = external unnamed_addr constant [11 x i8], align 8 -@.cst823 = external unnamed_addr constant [214 x i8], align 64 -@j.4580 = external global i32 -@j1.4581 = external global i32 -@nty1.4590 = external global [2 x i8] -@nty2.4591 = external global [2 x i8] -@xr1.4592 = external global float -@xr2.4593 = external global float -@yr1.4594 = external global float -@yr2.4595 = external global float - -@__main1_MOD_iave = external unnamed_addr global i32 -@__main1_MOD_igrp = external global i32 -@__main1_MOD_iounit = external global i32 -@__main1_MOD_ityp = external global i32 -@__main1_MOD_mclmsg = external unnamed_addr global %struct.array4_unknown, align 32 -@__main1_MOD_mxdate = external unnamed_addr global %"struct.array4_integer(kind=4).73", align 32 -@__main1_MOD_rmxval = external unnamed_addr global %"struct.array4_real(kind=4)", align 32 - -declare void @_gfortran_st_write(%struct.__st_parameter_dt*) -declare void @_gfortran_st_write_done(%struct.__st_parameter_dt*) -declare void @_gfortran_transfer_character_write(%struct.__st_parameter_dt*, i8*, i32) -declare void @_gfortran_transfer_integer_write(%struct.__st_parameter_dt*, i8*, i32) -declare void @_gfortran_transfer_real_write(%struct.__st_parameter_dt*, i8*, i32) - -define i1 @"prtmax___"(%struct.__st_parameter_dt* %memtmp3, i32 %D.4627_188.reload) nounwind { -; CHECK: prtmax__ -newFuncRoot: - br label %"" - -codeRepl80.exitStub: ; preds = %"" - ret i1 true - -"._crit_edge.exitStub": ; preds = %"" - ret i1 false - -"": ; preds = %newFuncRoot - %tmp128 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0 - %tmp129 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp128, i32 0, i32 2 - store i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.cst4, i64 0, i64 0), i8** %tmp129, align 8 - %tmp130 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0 - %tmp131 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp130, i32 0, i32 3 - store i32 31495, i32* %tmp131, align 4 - %tmp132 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 5 - store i8* getelementptr inbounds ([214 x i8], [214 x i8]* @.cst823, i64 0, i64 0), i8** %tmp132, align 8 - %tmp133 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 6 - store i32 214, i32* %tmp133, align 4 - %tmp134 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0 - %tmp135 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp134, i32 0, i32 0 - store i32 4096, i32* %tmp135, align 4 - %iounit.8748_288 = load i32, i32* @__main1_MOD_iounit, align 4 - %tmp136 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0 - %tmp137 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp136, i32 0, i32 1 - store i32 %iounit.8748_288, i32* %tmp137, align 4 - call void @_gfortran_st_write(%struct.__st_parameter_dt* %memtmp3) nounwind - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* @j.4580, i32 4) nounwind -; CHECK: @_gfortran_transfer_integer_write - %D.75807_289 = load i8*, i8** getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 0), align 8 - %j.8758_290 = load i32, i32* @j.4580, align 4 - %D.75760_291 = sext i32 %j.8758_290 to i64 - %iave.8736_292 = load i32, i32* @__main1_MOD_iave, align 4 - %D.75620_293 = sext i32 %iave.8736_292 to i64 - %D.75808_294 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 2, i32 0), align 8 - %D.75809_295 = mul nsw i64 %D.75620_293, %D.75808_294 - %igrp.8737_296 = load i32, i32* @__main1_MOD_igrp, align 4 - %D.75635_297 = sext i32 %igrp.8737_296 to i64 - %D.75810_298 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 1, i32 0), align 8 - %D.75811_299 = mul nsw i64 %D.75635_297, %D.75810_298 - %D.75812_300 = add nsw i64 %D.75809_295, %D.75811_299 - %D.75813_301 = add nsw i64 %D.75760_291, %D.75812_300 - %ityp.8750_302 = load i32, i32* @__main1_MOD_ityp, align 4 - %D.75704_303 = sext i32 %ityp.8750_302 to i64 - %D.75814_304 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 3, i32 0), align 8 - %D.75815_305 = mul nsw i64 %D.75704_303, %D.75814_304 - %D.75816_306 = add nsw i64 %D.75813_301, %D.75815_305 - %D.75817_307 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 1), align 8 - %D.75818_308 = add nsw i64 %D.75816_306, %D.75817_307 - %tmp138 = bitcast i8* %D.75807_289 to [0 x float]* - %tmp139 = bitcast [0 x float]* %tmp138 to float* - %D.75819_309 = getelementptr inbounds float, float* %tmp139, i64 %D.75818_308 - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* %D.75819_309, i32 4) nounwind -; CHECK: @_gfortran_transfer_real_write - %D.75820_310 = load i8*, i8** getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 0), align 8 - %j.8758_311 = load i32, i32* @j.4580, align 4 - %D.75760_312 = sext i32 %j.8758_311 to i64 - %iave.8736_313 = load i32, i32* @__main1_MOD_iave, align 4 - %D.75620_314 = sext i32 %iave.8736_313 to i64 - %D.75821_315 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 2, i32 0), align 8 - %D.75822_316 = mul nsw i64 %D.75620_314, %D.75821_315 - %igrp.8737_317 = load i32, i32* @__main1_MOD_igrp, align 4 - %D.75635_318 = sext i32 %igrp.8737_317 to i64 - %D.75823_319 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 1, i32 0), align 8 - %D.75824_320 = mul nsw i64 %D.75635_318, %D.75823_319 - %D.75825_321 = add nsw i64 %D.75822_316, %D.75824_320 - %D.75826_322 = add nsw i64 %D.75760_312, %D.75825_321 - %ityp.8750_323 = load i32, i32* @__main1_MOD_ityp, align 4 - %D.75704_324 = sext i32 %ityp.8750_323 to i64 - %D.75827_325 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 3, i32 0), align 8 - %D.75828_326 = mul nsw i64 %D.75704_324, %D.75827_325 - %D.75829_327 = add nsw i64 %D.75826_322, %D.75828_326 - %D.75830_328 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 1), align 8 - %D.75831_329 = add nsw i64 %D.75829_327, %D.75830_328 - %tmp140 = bitcast i8* %D.75820_310 to [0 x [1 x i8]]* - %tmp141 = bitcast [0 x [1 x i8]]* %tmp140 to [1 x i8]* - %D.75832_330 = getelementptr inbounds [1 x i8], [1 x i8]* %tmp141, i64 %D.75831_329 - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [1 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [1 x i8]* %D.75832_330, i32 1) nounwind -; CHECK: @_gfortran_transfer_character_write - %D.75833_331 = load i8*, i8** getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 0), align 8 - %j.8758_332 = load i32, i32* @j.4580, align 4 - %D.75760_333 = sext i32 %j.8758_332 to i64 - %iave.8736_334 = load i32, i32* @__main1_MOD_iave, align 4 - %D.75620_335 = sext i32 %iave.8736_334 to i64 - %D.75834_336 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 2, i32 0), align 8 - %D.75835_337 = mul nsw i64 %D.75620_335, %D.75834_336 - %igrp.8737_338 = load i32, i32* @__main1_MOD_igrp, align 4 - %D.75635_339 = sext i32 %igrp.8737_338 to i64 - %D.75836_340 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 1, i32 0), align 8 - %D.75837_341 = mul nsw i64 %D.75635_339, %D.75836_340 - %D.75838_342 = add nsw i64 %D.75835_337, %D.75837_341 - %D.75839_343 = add nsw i64 %D.75760_333, %D.75838_342 - %ityp.8750_344 = load i32, i32* @__main1_MOD_ityp, align 4 - %D.75704_345 = sext i32 %ityp.8750_344 to i64 - %D.75840_346 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 3, i32 0), align 8 - %D.75841_347 = mul nsw i64 %D.75704_345, %D.75840_346 - %D.75842_348 = add nsw i64 %D.75839_343, %D.75841_347 - %D.75843_349 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 1), align 8 - %D.75844_350 = add nsw i64 %D.75842_348, %D.75843_349 - %tmp142 = bitcast i8* %D.75833_331 to [0 x i32]* - %tmp143 = bitcast [0 x i32]* %tmp142 to i32* - %D.75845_351 = getelementptr inbounds i32, i32* %tmp143, i64 %D.75844_350 - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* %D.75845_351, i32 4) nounwind -; CHECK: @_gfortran_transfer_integer_write - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @xr1.4592, i32 4) nounwind -; CHECK: @_gfortran_transfer_real_write - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @yr1.4594, i32 4) nounwind -; CHECK: @_gfortran_transfer_real_write - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [2 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [2 x i8]* @nty1.4590, i32 2) nounwind -; CHECK: @_gfortran_transfer_character_write - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* @j1.4581, i32 4) nounwind -; CHECK: @_gfortran_transfer_integer_write - %D.75807_352 = load i8*, i8** getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 0), align 8 - %j1.8760_353 = load i32, i32* @j1.4581, align 4 - %D.75773_354 = sext i32 %j1.8760_353 to i64 - %iave.8736_355 = load i32, i32* @__main1_MOD_iave, align 4 - %D.75620_356 = sext i32 %iave.8736_355 to i64 - %D.75808_357 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 2, i32 0), align 8 - %D.75809_358 = mul nsw i64 %D.75620_356, %D.75808_357 - %igrp.8737_359 = load i32, i32* @__main1_MOD_igrp, align 4 - %D.75635_360 = sext i32 %igrp.8737_359 to i64 - %D.75810_361 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 1, i32 0), align 8 - %D.75811_362 = mul nsw i64 %D.75635_360, %D.75810_361 - %D.75812_363 = add nsw i64 %D.75809_358, %D.75811_362 - %D.75846_364 = add nsw i64 %D.75773_354, %D.75812_363 - %ityp.8750_365 = load i32, i32* @__main1_MOD_ityp, align 4 - %D.75704_366 = sext i32 %ityp.8750_365 to i64 - %D.75814_367 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 3, i32 0), align 8 - %D.75815_368 = mul nsw i64 %D.75704_366, %D.75814_367 - %D.75847_369 = add nsw i64 %D.75846_364, %D.75815_368 - %D.75817_370 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 1), align 8 - %D.75848_371 = add nsw i64 %D.75847_369, %D.75817_370 - %tmp144 = bitcast i8* %D.75807_352 to [0 x float]* - %tmp145 = bitcast [0 x float]* %tmp144 to float* - %D.75849_372 = getelementptr inbounds float, float* %tmp145, i64 %D.75848_371 - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* %D.75849_372, i32 4) nounwind -; CHECK: @_gfortran_transfer_real_write - %D.75820_373 = load i8*, i8** getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 0), align 8 - %j1.8760_374 = load i32, i32* @j1.4581, align 4 - %D.75773_375 = sext i32 %j1.8760_374 to i64 - %iave.8736_376 = load i32, i32* @__main1_MOD_iave, align 4 - %D.75620_377 = sext i32 %iave.8736_376 to i64 - %D.75821_378 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 2, i32 0), align 8 - %D.75822_379 = mul nsw i64 %D.75620_377, %D.75821_378 - %igrp.8737_380 = load i32, i32* @__main1_MOD_igrp, align 4 - %D.75635_381 = sext i32 %igrp.8737_380 to i64 - %D.75823_382 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 1, i32 0), align 8 - %D.75824_383 = mul nsw i64 %D.75635_381, %D.75823_382 - %D.75825_384 = add nsw i64 %D.75822_379, %D.75824_383 - %D.75850_385 = add nsw i64 %D.75773_375, %D.75825_384 - %ityp.8750_386 = load i32, i32* @__main1_MOD_ityp, align 4 - %D.75704_387 = sext i32 %ityp.8750_386 to i64 - %D.75827_388 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 3, i32 0), align 8 - %D.75828_389 = mul nsw i64 %D.75704_387, %D.75827_388 - %D.75851_390 = add nsw i64 %D.75850_385, %D.75828_389 - %D.75830_391 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 1), align 8 - %D.75852_392 = add nsw i64 %D.75851_390, %D.75830_391 - %tmp146 = bitcast i8* %D.75820_373 to [0 x [1 x i8]]* - %tmp147 = bitcast [0 x [1 x i8]]* %tmp146 to [1 x i8]* - %D.75853_393 = getelementptr inbounds [1 x i8], [1 x i8]* %tmp147, i64 %D.75852_392 - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [1 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [1 x i8]* %D.75853_393, i32 1) nounwind -; CHECK: @_gfortran_transfer_character_write - %D.75833_394 = load i8*, i8** getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 0), align 8 - %j1.8760_395 = load i32, i32* @j1.4581, align 4 - %D.75773_396 = sext i32 %j1.8760_395 to i64 - %iave.8736_397 = load i32, i32* @__main1_MOD_iave, align 4 - %D.75620_398 = sext i32 %iave.8736_397 to i64 - %D.75834_399 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 2, i32 0), align 8 - %D.75835_400 = mul nsw i64 %D.75620_398, %D.75834_399 - %igrp.8737_401 = load i32, i32* @__main1_MOD_igrp, align 4 - %D.75635_402 = sext i32 %igrp.8737_401 to i64 - %D.75836_403 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 1, i32 0), align 8 - %D.75837_404 = mul nsw i64 %D.75635_402, %D.75836_403 - %D.75838_405 = add nsw i64 %D.75835_400, %D.75837_404 - %D.75854_406 = add nsw i64 %D.75773_396, %D.75838_405 - %ityp.8750_407 = load i32, i32* @__main1_MOD_ityp, align 4 - %D.75704_408 = sext i32 %ityp.8750_407 to i64 - %D.75840_409 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 3, i32 0), align 8 - %D.75841_410 = mul nsw i64 %D.75704_408, %D.75840_409 - %D.75855_411 = add nsw i64 %D.75854_406, %D.75841_410 - %D.75843_412 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 1), align 8 - %D.75856_413 = add nsw i64 %D.75855_411, %D.75843_412 - %tmp148 = bitcast i8* %D.75833_394 to [0 x i32]* - %tmp149 = bitcast [0 x i32]* %tmp148 to i32* - %D.75857_414 = getelementptr inbounds i32, i32* %tmp149, i64 %D.75856_413 - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* %D.75857_414, i32 4) nounwind -; CHECK: @_gfortran_transfer_integer_write - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @xr2.4593, i32 4) nounwind -; CHECK: @_gfortran_transfer_real_write - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @yr2.4595, i32 4) nounwind -; CHECK: @_gfortran_transfer_real_write - call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [2 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [2 x i8]* @nty2.4591, i32 2) nounwind -; CHECK: @_gfortran_transfer_character_write - call void @_gfortran_st_write_done(%struct.__st_parameter_dt* %memtmp3) nounwind -; CHECK: @_gfortran_st_write_done - %j.8758_415 = load i32, i32* @j.4580, align 4 - %D.4634_416 = icmp eq i32 %j.8758_415, %D.4627_188.reload - %j.8758_417 = load i32, i32* @j.4580, align 4 - %j.8770_418 = add nsw i32 %j.8758_417, 1 - store i32 %j.8770_418, i32* @j.4580, align 4 - %tmp150 = icmp ne i1 %D.4634_416, false - br i1 %tmp150, label %codeRepl80.exitStub, label %"._crit_edge.exitStub" -} - diff --git a/test/Transforms/BBVectorize/ld1.ll b/test/Transforms/BBVectorize/ld1.ll deleted file mode 100644 index 368c38aa5ce..00000000000 --- a/test/Transforms/BBVectorize/ld1.ll +++ /dev/null @@ -1,41 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s - -define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { -entry: - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %i2 = load double, double* %c, align 8 - %add = fadd double %mul, %i2 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - %arrayidx6 = getelementptr inbounds double, double* %c, i64 1 - %i5 = load double, double* %arrayidx6, align 8 - %add7 = fadd double %mul5, %i5 - %mul9 = fmul double %add, %i1 - %add11 = fadd double %mul9, %i2 - %mul13 = fmul double %add7, %i4 - %add15 = fadd double %mul13, %i5 - %mul16 = fmul double %add11, %add15 - ret double %mul16 -; CHECK-LABEL: @test1( -; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* -; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* -; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>* -; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8 -; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8 -; CHECK: %mul = fmul <2 x double> %i0, %i1 -; CHECK: %i2 = load <2 x double>, <2 x double>* %i2.v.i0, align 8 -; CHECK: %add = fadd <2 x double> %mul, %i2 -; CHECK: %mul9 = fmul <2 x double> %add, %i1 -; CHECK: %add11 = fadd <2 x double> %mul9, %i2 -; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0 -; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1 -; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2 -; CHECK: ret double %mul16 -} - diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg deleted file mode 100644 index e71f3cc4c41..00000000000 --- a/test/Transforms/BBVectorize/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -if not 'X86' in config.root.targets: - config.unsupported = True - diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll deleted file mode 100644 index 8ff5953cf46..00000000000 --- a/test/Transforms/BBVectorize/loop1.ll +++ /dev/null @@ -1,93 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -; RUN: opt < %s -dont-improve-non-negative-phi-bits=false -basicaa -loop-unroll -unroll-threshold=45 -unroll-partial-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL -; The second check covers the use of alias analysis (with loop unrolling). - -define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable { -entry: - br label %for.body -; CHECK-LABEL: @test1( -; CHECK-UNRL-LABEL: @test1( - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv - %0 = load double, double* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv - %1 = load double, double* %arrayidx2, align 8 - %mul = fmul double %0, %0 - %mul3 = fmul double %0, %1 - %add = fadd double %mul, %mul3 - %add4 = fadd double %1, %1 - %add5 = fadd double %add4, %0 - %mul6 = fmul double %0, %add5 - %add7 = fadd double %add, %mul6 - %mul8 = fmul double %1, %1 - %add9 = fadd double %0, %0 - %add10 = fadd double %add9, %0 - %mul11 = fmul double %mul8, %add10 - %add12 = fadd double %add7, %mul11 - %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv - store double %add12, double* %arrayidx14, align 8 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 10 - br i1 %exitcond, label %for.end, label %for.body -; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] -; CHECK: %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv -; CHECK: %0 = load double, double* %arrayidx, align 8 -; CHECK: %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv -; CHECK: %1 = load double, double* %arrayidx2, align 8 -; CHECK: %mul = fmul double %0, %0 -; CHECK: %mul3 = fmul double %0, %1 -; CHECK: %add = fadd double %mul, %mul3 -; CHECK: %mul8 = fmul double %1, %1 -; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0 -; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1 -; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2 -; CHECK: %2 = insertelement <2 x double> undef, double %0, i32 0 -; CHECK: %add5.v.i1.2 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer -; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2 -; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %2, double %mul8, i32 1 -; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5 -; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0 -; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1 -; CHECK: %add7 = fadd double %add, %mul6.v.r1 -; CHECK: %add12 = fadd double %add7, %mul6.v.r2 -; CHECK: %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv -; CHECK: store double %add12, double* %arrayidx14, align 8 -; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1 -; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32 -; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10 -; CHECK: br i1 %exitcond, label %for.end, label %for.body -; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ] -; CHECK-UNRL: %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv -; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>* -; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv -; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>* -; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv -; CHECK-UNRL: %2 = load <2 x double>, <2 x double>* %0, align 8 -; CHECK-UNRL: %3 = load <2 x double>, <2 x double>* %1, align 8 -; CHECK-UNRL: %mul = fmul <2 x double> %2, %2 -; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3 -; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3 -; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3 -; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2 -; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5 -; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6 -; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3 -; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2 -; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2 -; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10 -; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11 -; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>* -; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8 -; CHECK-UNRL: %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 -; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32 -; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10 -; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} diff --git a/test/Transforms/BBVectorize/mem-op-depth.ll b/test/Transforms/BBVectorize/mem-op-depth.ll deleted file mode 100644 index 732043b7f8e..00000000000 --- a/test/Transforms/BBVectorize/mem-op-depth.ll +++ /dev/null @@ -1,22 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=6 -instcombine -gvn -S | FileCheck %s - -@A = common global [1024 x float] zeroinitializer, align 16 -@B = common global [1024 x float] zeroinitializer, align 16 - -define i32 @test1() nounwind { -; CHECK-LABEL: @test1( - %V1 = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 0), align 16 - %V2 = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 1), align 4 - %V3= load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 2), align 8 - %V4 = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 3), align 4 -; CHECK: %V1 = load <4 x float>, <4 x float>* bitcast ([1024 x float]* @A to <4 x float>*), align 16 - store float %V1, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 0), align 16 - store float %V2, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 1), align 4 - store float %V3, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 2), align 8 - store float %V4, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 3), align 4 -; CHECK-NEXT: store <4 x float> %V1, <4 x float>* bitcast ([1024 x float]* @B to <4 x float>*), align 16 - ret i32 0 -; CHECK-NEXT: ret i32 0 -} diff --git a/test/Transforms/BBVectorize/metadata.ll b/test/Transforms/BBVectorize/metadata.ll deleted file mode 100644 index f5580a88861..00000000000 --- a/test/Transforms/BBVectorize/metadata.ll +++ /dev/null @@ -1,49 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -S | FileCheck %s - -; Simple 3-pair chain with loads and stores (with fpmath) -define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { -entry: - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1, !fpmath !2 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4, !fpmath !3 - store double %mul, double* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - ret void -; CHECK-LABEL: @test1( -; CHECK: !fpmath -; CHECK: ret void -} - -; Simple 3-pair chain with loads and stores (ints with range) -define void @test2(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly { -entry: - %i0 = load i64, i64* %a, align 8, !range !0 - %i1 = load i64, i64* %b, align 8 - %mul = mul i64 %i0, %i1 - %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1 - %i3 = load i64, i64* %arrayidx3, align 8, !range !1 - %arrayidx4 = getelementptr inbounds i64, i64* %b, i64 1 - %i4 = load i64, i64* %arrayidx4, align 8 - %mul5 = mul i64 %i3, %i4 - store i64 %mul, i64* %c, align 8 - %arrayidx5 = getelementptr inbounds i64, i64* %c, i64 1 - store i64 %mul5, i64* %arrayidx5, align 8 - ret void -; CHECK-LABEL: @test2( -; CHECK-NOT: !range -; CHECK: ret void -} - -!0 = !{i64 0, i64 2} -!1 = !{i64 3, i64 5} - -!2 = !{ float 5.0 } -!3 = !{ float 2.5 } - diff --git a/test/Transforms/BBVectorize/no-ldstr-conn.ll b/test/Transforms/BBVectorize/no-ldstr-conn.ll deleted file mode 100644 index a84cd658560..00000000000 --- a/test/Transforms/BBVectorize/no-ldstr-conn.ll +++ /dev/null @@ -1,23 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=2 -instcombine -gvn -S | FileCheck %s - -; Make sure that things (specifically getelementptr) are not connected to loads -; and stores via the address operand (which would be bad because the address -; is really a scalar even after vectorization) -define i64 @test2(i64 %a) nounwind uwtable readonly { -entry: - %a1 = inttoptr i64 %a to i64* - %a2 = getelementptr i64, i64* %a1, i64 1 - %a3 = getelementptr i64, i64* %a1, i64 2 - %v2 = load i64, i64* %a2, align 8 - %v3 = load i64, i64* %a3, align 8 - %v2a = add i64 %v2, 5 - %v3a = add i64 %v3, 7 - store i64 %v2a, i64* %a2, align 8 - store i64 %v3a, i64* %a3, align 8 - %r = add i64 %v2, %v3 - ret i64 %r -; CHECK-LABEL: @test2( -; CHECK-NOT: getelementptr i64, <2 x i64*> -} - diff --git a/test/Transforms/BBVectorize/req-depth.ll b/test/Transforms/BBVectorize/req-depth.ll deleted file mode 100644 index 2675354183a..00000000000 --- a/test/Transforms/BBVectorize/req-depth.ll +++ /dev/null @@ -1,17 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -bb-vectorize-ignore-target-info -S | FileCheck %s -check-prefix=CHECK-RD3 -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -bb-vectorize-ignore-target-info -S | FileCheck %s -check-prefix=CHECK-RD2 - -define double @test1(double %A1, double %A2, double %B1, double %B2) { - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %R = fmul double %Y1, %Y2 - ret double %R -; CHECK-RD3-LABEL: @test1( -; CHECK-RD2-LABEL: @test1( -; CHECK-RD3-NOT: <2 x double> -; CHECK-RD2: <2 x double> -} - diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll deleted file mode 100644 index be38d340260..00000000000 --- a/test/Transforms/BBVectorize/search-limit.ll +++ /dev/null @@ -1,46 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4 - -define double @test1(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test1( -; CHECK-SL4-LABEL: @test1( -; CHECK-SL4-NOT: <2 x double> -; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 -; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 -; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 -; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 - %Z1 = fadd double %Y1, %B1 - ; Here we have a dependency chain: the short search limit will not - ; see past this chain and so will not see the second part of the - ; pair to vectorize. - %mul41 = fmul double %Z1, %Y2 - %sub48 = fsub double %Z1, %mul41 - %mul62 = fmul double %Z1, %sub48 - %sub69 = fsub double %Z1, %mul62 - %mul83 = fmul double %Z1, %sub69 - %sub90 = fsub double %Z1, %mul83 - %mul104 = fmul double %Z1, %sub90 - %sub111 = fsub double %Z1, %mul104 - %mul125 = fmul double %Z1, %sub111 - %sub132 = fsub double %Z1, %mul125 - %mul146 = fmul double %Z1, %sub132 - %sub153 = fsub double %Z1, %mul146 - ; end of chain. - %Z2 = fadd double %Y2, %B2 -; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 - %R1 = fdiv double %Z1, %Z2 - %R = fmul double %R1, %sub153 -; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 -; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 -; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2 - ret double %R -; CHECK: ret double %R -} - diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll deleted file mode 100644 index dd5e90841a7..00000000000 --- a/test/Transforms/BBVectorize/simple-int.ll +++ /dev/null @@ -1,514 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" - -declare double @llvm.fma.f64(double, double, double) -declare double @llvm.fmuladd.f64(double, double, double) -declare double @llvm.cos.f64(double) -declare double @llvm.powi.f64(double, i32) -declare double @llvm.round.f64(double) -declare double @llvm.copysign.f64(double, double) -declare double @llvm.ceil.f64(double) -declare double @llvm.nearbyint.f64(double) -declare double @llvm.rint.f64(double) -declare double @llvm.trunc.f64(double) -declare double @llvm.floor.f64(double) -declare double @llvm.fabs.f64(double) -declare i64 @llvm.bswap.i64(i64) -declare i64 @llvm.ctpop.i64(i64) -declare i64 @llvm.ctlz.i64(i64, i1) -declare i64 @llvm.cttz.i64(i64, i1) - -; Basic depth-3 chain with fma -define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1_V_I2_1:%.*]] = insertelement <2 x double> undef, double [[C1:%.*]], i32 0 -; CHECK-NEXT: [[Y1_V_I2_2:%.*]] = insertelement <2 x double> [[Y1_V_I2_1]], double [[C2:%.*]], i32 1 -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X1]], <2 x double> [[X1_V_I0_2]], <2 x double> [[Y1_V_I2_2]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1) - %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with fmuladd -define double @test1a(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { -; CHECK-LABEL: @test1a( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1_V_I2_1:%.*]] = insertelement <2 x double> undef, double [[C1:%.*]], i32 0 -; CHECK-NEXT: [[Y1_V_I2_2:%.*]] = insertelement <2 x double> [[Y1_V_I2_1]], double [[C2:%.*]], i32 1 -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[X1]], <2 x double> [[X1_V_I0_2]], <2 x double> [[Y1_V_I2_2]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.fmuladd.f64(double %X1, double %A1, double %C1) - %Y2 = call double @llvm.fmuladd.f64(double %X2, double %A2, double %C2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with cos -define double @test2(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.cos.f64(double %X1) - %Y2 = call double @llvm.cos.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with powi -define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.powi.v2f64(<2 x double> [[X1]], i32 [[P:%.*]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.powi.f64(double %X1, i32 %P) - %Y2 = call double @llvm.powi.f64(double %X2, i32 %P) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with powi (different powers: should not vectorize) -define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[P2:%.*]] = add i32 [[P:%.*]], 1 -; CHECK-NEXT: [[Y1:%.*]] = call double @llvm.powi.f64(double [[X1]], i32 [[P]]) -; CHECK-NEXT: [[Y2:%.*]] = call double @llvm.powi.f64(double [[X2]], i32 [[P2]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd double [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = fadd double [[Y2]], [[B2]] -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1]], [[Z2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %P2 = add i32 %P, 1 - %Y1 = call double @llvm.powi.f64(double %X1, i32 %P) - %Y2 = call double @llvm.powi.f64(double %X2, i32 %P2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with round -define double @testround(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testround( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.round.f64(double %X1) - %Y2 = call double @llvm.round.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with copysign -define double @testcopysign(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testcopysign( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1_V_I1_2:%.*]] = shufflevector <2 x double> [[X1_V_I0_1]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[X1]], <2 x double> [[Y1_V_I1_2]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.copysign.f64(double %X1, double %A1) - %Y2 = call double @llvm.copysign.f64(double %X2, double %A1) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with ceil -define double @testceil(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testceil( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.ceil.f64(double %X1) - %Y2 = call double @llvm.ceil.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with nearbyint -define double @testnearbyint(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testnearbyint( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.nearbyint.f64(double %X1) - %Y2 = call double @llvm.nearbyint.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with rint -define double @testrint(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testrint( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.rint.f64(double %X1) - %Y2 = call double @llvm.rint.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with trunc -define double @testtrunc(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testtrunc( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.trunc.f64(double %X1) - %Y2 = call double @llvm.trunc.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with floor -define double @testfloor(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testfloor( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.floor.f64(double %X1) - %Y2 = call double @llvm.floor.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with fabs -define double @testfabs(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @testfabs( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = call double @llvm.fabs.f64(double %X1) - %Y2 = call double @llvm.fabs.f64(double %X2) - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain with bswap -define i64 @testbswap(i64 %A1, i64 %A2, i64 %B1, i64 %B2) { -; CHECK-LABEL: @testbswap( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret i64 [[R]] -; - %X1 = sub i64 %A1, %B1 - %X2 = sub i64 %A2, %B2 - %Y1 = call i64 @llvm.bswap.i64(i64 %X1) - %Y2 = call i64 @llvm.bswap.i64(i64 %X2) - %Z1 = add i64 %Y1, %B1 - %Z2 = add i64 %Y2, %B2 - %R = mul i64 %Z1, %Z2 - ret i64 %R -} - -; Basic depth-3 chain with ctpop -define i64 @testctpop(i64 %A1, i64 %A2, i64 %B1, i64 %B2) { -; CHECK-LABEL: @testctpop( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[X1]]) -; CHECK-NEXT: [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret i64 [[R]] -; - %X1 = sub i64 %A1, %B1 - %X2 = sub i64 %A2, %B2 - %Y1 = call i64 @llvm.ctpop.i64(i64 %X1) - %Y2 = call i64 @llvm.ctpop.i64(i64 %X2) - %Z1 = add i64 %Y1, %B1 - %Z2 = add i64 %Y2, %B2 - %R = mul i64 %Z1, %Z2 - ret i64 %R -} - -; Basic depth-3 chain with ctlz -define i64 @testctlz(i64 %A1, i64 %A2, i64 %B1, i64 %B2) { -; CHECK-LABEL: @testctlz( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[X1]], i1 true) -; CHECK-NEXT: [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret i64 [[R]] -; - %X1 = sub i64 %A1, %B1 - %X2 = sub i64 %A2, %B2 - %Y1 = call i64 @llvm.ctlz.i64(i64 %X1, i1 true) - %Y2 = call i64 @llvm.ctlz.i64(i64 %X2, i1 true) - %Z1 = add i64 %Y1, %B1 - %Z2 = add i64 %Y2, %B2 - %R = mul i64 %Z1, %Z2 - ret i64 %R - -} - -; Basic depth-3 chain with ctlz -define i64 @testctlzneg(i64 %A1, i64 %A2, i64 %B1, i64 %B2) { -; CHECK-LABEL: @testctlzneg( -; CHECK-NEXT: [[X1:%.*]] = sub i64 [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = sub i64 [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[Y1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[X1]], i1 true), !range !0 -; CHECK-NEXT: [[Y2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[X2]], i1 false), !range !0 -; CHECK-NEXT: [[Z1:%.*]] = add i64 [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = add i64 [[Y2]], [[B2]] -; CHECK-NEXT: [[R:%.*]] = mul i64 [[Z1]], [[Z2]] -; CHECK-NEXT: ret i64 [[R]] -; - %X1 = sub i64 %A1, %B1 - %X2 = sub i64 %A2, %B2 - %Y1 = call i64 @llvm.ctlz.i64(i64 %X1, i1 true) - %Y2 = call i64 @llvm.ctlz.i64(i64 %X2, i1 false) - %Z1 = add i64 %Y1, %B1 - %Z2 = add i64 %Y2, %B2 - %R = mul i64 %Z1, %Z2 - ret i64 %R -} - -; Basic depth-3 chain with cttz -define i64 @testcttz(i64 %A1, i64 %A2, i64 %B1, i64 %B2) { -; CHECK-LABEL: @testcttz( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[X1]], i1 true) -; CHECK-NEXT: [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret i64 [[R]] -; - %X1 = sub i64 %A1, %B1 - %X2 = sub i64 %A2, %B2 - %Y1 = call i64 @llvm.cttz.i64(i64 %X1, i1 true) - %Y2 = call i64 @llvm.cttz.i64(i64 %X2, i1 true) - %Z1 = add i64 %Y1, %B1 - %Z2 = add i64 %Y2, %B2 - %R = mul i64 %Z1, %Z2 - ret i64 %R - -} - -; Basic depth-3 chain with cttz -define i64 @testcttzneg(i64 %A1, i64 %A2, i64 %B1, i64 %B2) { -; CHECK-LABEL: @testcttzneg( -; CHECK-NEXT: [[X1:%.*]] = sub i64 [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[X2:%.*]] = sub i64 [[A2:%.*]], [[B2:%.*]] -; CHECK-NEXT: [[Y1:%.*]] = call i64 @llvm.cttz.i64(i64 [[X1]], i1 true), !range !0 -; CHECK-NEXT: [[Y2:%.*]] = call i64 @llvm.cttz.i64(i64 [[X2]], i1 false), !range !0 -; CHECK-NEXT: [[Z1:%.*]] = add i64 [[Y1]], [[B1]] -; CHECK-NEXT: [[Z2:%.*]] = add i64 [[Y2]], [[B2]] -; CHECK-NEXT: [[R:%.*]] = mul i64 [[Z1]], [[Z2]] -; CHECK-NEXT: ret i64 [[R]] -; - %X1 = sub i64 %A1, %B1 - %X2 = sub i64 %A2, %B2 - %Y1 = call i64 @llvm.cttz.i64(i64 %X1, i1 true) - %Y2 = call i64 @llvm.cttz.i64(i64 %X2, i1 false) - %Z1 = add i64 %Y1, %B1 - %Z2 = add i64 %Y2, %B2 - %R = mul i64 %Z1, %Z2 - ret i64 %R -} - -; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #0 -; CHECK: declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) #0 -; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) #0 -; CHECK: declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32) #0 -; CHECK: declare <2 x double> @llvm.round.v2f64(<2 x double>) #0 -; CHECK: declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) #0 -; CHECK: declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #0 -; CHECK: declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 -; CHECK: declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0 -; CHECK: declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #0 -; CHECK: declare <2 x double> @llvm.floor.v2f64(<2 x double>) #0 -; CHECK: declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0 -; CHECK: declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #0 -; CHECK: declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) #0 -; CHECK: declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0 -; CHECK: declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) #0 -; CHECK: attributes #0 = { nounwind readnone speculatable } diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll deleted file mode 100644 index fcc0236bae9..00000000000 --- a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll +++ /dev/null @@ -1,134 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO - -; FIXME: re-enable this once pointer vectors work properly -; XFAIL: * - -; Simple 3-pair chain also with loads and stores (using ptrs and gep) -define double @test1(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly { -entry: - %i0 = load i64, i64* %a, align 8 - %i1 = load i64, i64* %b, align 8 - %mul = mul i64 %i0, %i1 - %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1 - %i3 = load i64, i64* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds i64, i64* %b, i64 1 - %i4 = load i64, i64* %arrayidx4, align 8 - %mul5 = mul i64 %i3, %i4 - %ptr = inttoptr i64 %mul to double* - %ptr5 = inttoptr i64 %mul5 to double* - %aptr = getelementptr inbounds double, double* %ptr, i64 2 - %aptr5 = getelementptr inbounds double, double* %ptr5, i64 3 - %av = load double, double* %aptr, align 16 - %av5 = load double, double* %aptr5, align 16 - %r = fmul double %av, %av5 - store i64 %mul, i64* %c, align 8 - %arrayidx5 = getelementptr inbounds i64, i64* %c, i64 1 - store i64 %mul5, i64* %arrayidx5, align 8 - ret double %r -; CHECK-LABEL: @test1( -; CHECK: %i0.v.i0 = bitcast i64* %a to <2 x i64>* -; CHECK: %i1.v.i0 = bitcast i64* %b to <2 x i64>* -; CHECK: %i0 = load <2 x i64>, <2 x i64>* %i0.v.i0, align 8 -; CHECK: %i1 = load <2 x i64>, <2 x i64>* %i1.v.i0, align 8 -; CHECK: %mul = mul <2 x i64> %i0, %i1 -; CHECK: %ptr = inttoptr <2 x i64> %mul to <2 x double*> -; CHECK: %aptr = getelementptr inbounds double, <2 x double*> %ptr, <2 x i64> -; CHECK: %aptr.v.r1 = extractelement <2 x double*> %aptr, i32 0 -; CHECK: %aptr.v.r2 = extractelement <2 x double*> %aptr, i32 1 -; CHECK: %av = load double, double* %aptr.v.r1, align 16 -; CHECK: %av5 = load double, double* %aptr.v.r2, align 16 -; CHECK: %r = fmul double %av, %av5 -; CHECK: %0 = bitcast i64* %c to <2 x i64>* -; CHECK: store <2 x i64> %mul, <2 x i64>* %0, align 8 -; CHECK: ret double %r -; CHECK-AO-LABEL: @test1( -; CHECK-AO-NOT: load <2 x -} - -; Simple 3-pair chain with loads and stores (using ptrs and gep) -define void @test2(i64** %a, i64** %b, i64** %c) nounwind uwtable readonly { -entry: - %i0 = load i64*, i64** %a, align 8 - %i1 = load i64*, i64** %b, align 8 - %arrayidx3 = getelementptr inbounds i64*, i64** %a, i64 1 - %i3 = load i64*, i64** %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds i64*, i64** %b, i64 1 - %i4 = load i64*, i64** %arrayidx4, align 8 - %o1 = load i64, i64* %i1, align 8 - %o4 = load i64, i64* %i4, align 8 - %ptr0 = getelementptr inbounds i64, i64* %i0, i64 %o1 - %ptr3 = getelementptr inbounds i64, i64* %i3, i64 %o4 - store i64* %ptr0, i64** %c, align 8 - %arrayidx5 = getelementptr inbounds i64*, i64** %c, i64 1 - store i64* %ptr3, i64** %arrayidx5, align 8 - ret void -; CHECK-LABEL: @test2( -; CHECK: %i0.v.i0 = bitcast i64** %a to <2 x i64*>* -; CHECK: %i1 = load i64*, i64** %b, align 8 -; CHECK: %i0 = load <2 x i64*>, <2 x i64*>* %i0.v.i0, align 8 -; CHECK: %arrayidx4 = getelementptr inbounds i64*, i64** %b, i64 1 -; CHECK: %i4 = load i64*, i64** %arrayidx4, align 8 -; CHECK: %o1 = load i64, i64* %i1, align 8 -; CHECK: %o4 = load i64, i64* %i4, align 8 -; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0 -; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1 -; CHECK: %ptr0 = getelementptr inbounds i64, <2 x i64*> %i0, <2 x i64> %ptr0.v.i1.2 -; CHECK: %0 = bitcast i64** %c to <2 x i64*>* -; CHECK: store <2 x i64*> %ptr0, <2 x i64*>* %0, align 8 -; CHECK: ret void -; CHECK-AO-LABEL: @test2( -; CHECK-AO-NOT: <2 x -} - -; Simple 3-pair chain with loads and stores (using ptrs and gep) -; using pointer vectors. -define void @test3(<2 x i64*>* %a, <2 x i64*>* %b, <2 x i64*>* %c) nounwind uwtable readonly { -entry: - %i0 = load <2 x i64*>, <2 x i64*>* %a, align 8 - %i1 = load <2 x i64*>, <2 x i64*>* %b, align 8 - %arrayidx3 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %a, i64 1 - %i3 = load <2 x i64*>, <2 x i64*>* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %b, i64 1 - %i4 = load <2 x i64*>, <2 x i64*>* %arrayidx4, align 8 - %j1 = extractelement <2 x i64*> %i1, i32 0 - %j4 = extractelement <2 x i64*> %i4, i32 0 - %o1 = load i64, i64* %j1, align 8 - %o4 = load i64, i64* %j4, align 8 - %j0 = extractelement <2 x i64*> %i0, i32 0 - %j3 = extractelement <2 x i64*> %i3, i32 0 - %ptr0 = getelementptr inbounds i64, i64* %j0, i64 %o1 - %ptr3 = getelementptr inbounds i64, i64* %j3, i64 %o4 - %qtr0 = insertelement <2 x i64*> undef, i64* %ptr0, i32 0 - %rtr0 = insertelement <2 x i64*> %qtr0, i64* %ptr0, i32 1 - %qtr3 = insertelement <2 x i64*> undef, i64* %ptr3, i32 0 - %rtr3 = insertelement <2 x i64*> %qtr3, i64* %ptr3, i32 1 - store <2 x i64*> %rtr0, <2 x i64*>* %c, align 8 - %arrayidx5 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %c, i64 1 - store <2 x i64*> %rtr3, <2 x i64*>* %arrayidx5, align 8 - ret void -; CHECK-LABEL: @test3( -; CHECK: %i0.v.i0 = bitcast <2 x i64*>* %a to <4 x i64*>* -; CHECK: %i1 = load <2 x i64*>, <2 x i64*>* %b, align 8 -; CHECK: %i0 = load <4 x i64*>, <4 x i64*>* %i0.v.i0, align 8 -; CHECK: %arrayidx4 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %b, i64 1 -; CHECK: %i4 = load <2 x i64*>, <2 x i64*>* %arrayidx4, align 8 -; CHECK: %j1 = extractelement <2 x i64*> %i1, i32 0 -; CHECK: %j4 = extractelement <2 x i64*> %i4, i32 0 -; CHECK: %o1 = load i64, i64* %j1, align 8 -; CHECK: %o4 = load i64, i64* %j4, align 8 -; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0 -; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1 -; CHECK: %ptr0.v.i0 = shufflevector <4 x i64*> %i0, <4 x i64*> undef, <2 x i32> -; CHECK: %ptr0 = getelementptr inbounds i64, <2 x i64*> %ptr0.v.i0, <2 x i64> %ptr0.v.i1.2 -; CHECK: %rtr0 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> zeroinitializer -; CHECK: %rtr3 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> -; CHECK: %0 = bitcast <2 x i64*>* %c to <4 x i64*>* -; CHECK: %1 = shufflevector <2 x i64*> %rtr0, <2 x i64*> %rtr3, <4 x i32> -; CHECK: store <4 x i64*> %1, <4 x i64*>* %0, align 8 -; CHECK: ret void -; CHECK-AO-LABEL: @test3( -; CHECK-AO-NOT: <4 x -} - diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll deleted file mode 100644 index 56c1a06b42e..00000000000 --- a/test/Transforms/BBVectorize/simple-ldstr.ll +++ /dev/null @@ -1,170 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO - -; Simple 3-pair chain with loads and stores -define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { -entry: - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - ret void -; CHECK-LABEL: @test1( -; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* -; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* -; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8 -; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8 -; CHECK: %mul = fmul <2 x double> %i0, %i1 -; CHECK: %0 = bitcast double* %c to <2 x double>* -; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 -; CHECK: ret void -; CHECK-AO-LABEL: @test1( -; CHECK-AO-NOT: <2 x double> -} - -; Simple chain with extending loads and stores -define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly { -entry: - %i0f = load float, float* %a, align 4 - %i0 = fpext float %i0f to double - %i1f = load float, float* %b, align 4 - %i1 = fpext float %i1f to double - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds float, float* %a, i64 1 - %i3f = load float, float* %arrayidx3, align 4 - %i3 = fpext float %i3f to double - %arrayidx4 = getelementptr inbounds float, float* %b, i64 1 - %i4f = load float, float* %arrayidx4, align 4 - %i4 = fpext float %i4f to double - %mul5 = fmul double %i3, %i4 - store double %mul, double* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - ret void -; CHECK-LABEL: @test2( -; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>* -; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>* -; CHECK: %i0f = load <2 x float>, <2 x float>* %i0f.v.i0, align 4 -; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double> -; CHECK: %i1f = load <2 x float>, <2 x float>* %i1f.v.i0, align 4 -; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double> -; CHECK: %mul = fmul <2 x double> %i0, %i1 -; CHECK: %0 = bitcast double* %c to <2 x double>* -; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 -; CHECK: ret void -; CHECK-AO-LABEL: @test2( -; CHECK-AO-NOT: <2 x double> -} - -; Simple chain with loads and truncating stores -define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly { -entry: - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %mulf = fptrunc double %mul to float - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - %mul5f = fptrunc double %mul5 to float - store float %mulf, float* %c, align 8 - %arrayidx5 = getelementptr inbounds float, float* %c, i64 1 - store float %mul5f, float* %arrayidx5, align 4 - ret void -; CHECK-LABEL: @test3( -; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* -; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* -; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8 -; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8 -; CHECK: %mul = fmul <2 x double> %i0, %i1 -; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float> -; CHECK: %0 = bitcast float* %c to <2 x float>* -; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8 -; CHECK: ret void -; CHECK-AO-LABEL: @test3( -; CHECK-AO: %i0 = load double, double* %a, align 8 -; CHECK-AO: %i1 = load double, double* %b, align 8 -; CHECK-AO: %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 -; CHECK-AO: %i3 = load double, double* %arrayidx3, align 8 -; CHECK-AO: %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 -; CHECK-AO: %i4 = load double, double* %arrayidx4, align 8 -; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0 -; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1 -; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0 -; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1 -; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2 -; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float> -; CHECK-AO: %0 = bitcast float* %c to <2 x float>* -; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8 -; CHECK-AO: ret void -} - -; Simple 3-pair chain with loads and stores (unreachable) -define void @test4(i1 %bool, double* %a, double* %b, double* %c) nounwind uwtable readonly { -entry: - br i1 %bool, label %if.then1, label %if.end - -if.then1: - unreachable - br label %if.then - -if.then: - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - br label %if.end - -if.end: - ret void -; CHECK-LABEL: @test4( -; CHECK-NOT: <2 x double> -; CHECK-AO-LABEL: @test4( -; CHECK-AO-NOT: <2 x double> -} - -; Simple 3-pair chain with loads and stores -define void @test5(double* %a, double* %b, double* %c) nounwind uwtable readonly { -entry: - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - store double %mul, double* %c, align 4 - ret void -; CHECK-LABEL: @test5( -; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* -; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* -; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8 -; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8 -; CHECK: %mul = fmul <2 x double> %i0, %i1 -; CHECK: %0 = bitcast double* %c to <2 x double>* -; CHECK: store <2 x double> %mul, <2 x double>* %0, align 4 -; CHECK: ret void -; CHECK-AO-LABEL: @test5( -; CHECK-AO-NOT: <2 x double> -} - diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll deleted file mode 100644 index 269b07f82d1..00000000000 --- a/test/Transforms/BBVectorize/simple-sel.ll +++ /dev/null @@ -1,59 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-no-bools -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-NB - -; Basic depth-3 chain with select -define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) { -; CHECK-LABEL: @test1( -; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 -; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 -; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 -; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 - %Z1 = select i1 %C1, double %Y1, double %B1 - %Z2 = select i1 %C2, double %Y2, double %B2 -; CHECK: %Z1.v.i0.1 = insertelement <2 x i1> undef, i1 %C1, i32 0 -; CHECK: %Z1.v.i0.2 = insertelement <2 x i1> %Z1.v.i0.1, i1 %C2, i32 1 -; CHECK: %Z1 = select <2 x i1> %Z1.v.i0.2, <2 x double> %Y1, <2 x double> %X1.v.i1.2 - %R = fmul double %Z1, %Z2 -; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 -; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 -; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 - ret double %R -; CHECK: ret double %R -} - -; Basic depth-3 chain with select (and vect. compare) -define double @test2(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test2( -; CHECK-NB-LABEL: @test2( -; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 -; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 -; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 -; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 - %C1 = fcmp ogt double %X1, %A1 - %C2 = fcmp ogt double %X2, %A2 -; CHECK: %C1 = fcmp ogt <2 x double> %X1, %X1.v.i0.2 -; CHECK-NB: fcmp ogt double - %Z1 = select i1 %C1, double %Y1, double %B1 - %Z2 = select i1 %C2, double %Y2, double %B2 -; CHECK: %Z1 = select <2 x i1> %C1, <2 x double> %Y1, <2 x double> %X1.v.i1.2 - %R = fmul double %Z1, %Z2 -; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 -; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 -; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 - ret double %R -; CHECK: ret double %R -} - diff --git a/test/Transforms/BBVectorize/simple-tst.ll b/test/Transforms/BBVectorize/simple-tst.ll deleted file mode 100644 index 6a88e1b09c1..00000000000 --- a/test/Transforms/BBVectorize/simple-tst.ll +++ /dev/null @@ -1,18 +0,0 @@ -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux" -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=256 -instcombine -gvn -S | FileCheck %s - -; Basic depth-3 chain (target-specific type should not vectorize) -define ppc_fp128 @test7(ppc_fp128 %A1, ppc_fp128 %A2, ppc_fp128 %B1, ppc_fp128 %B2) { -; CHECK-LABEL: @test7( -; CHECK-NOT: <2 x ppc_fp128> - %X1 = fsub ppc_fp128 %A1, %B1 - %X2 = fsub ppc_fp128 %A2, %B2 - %Y1 = fmul ppc_fp128 %X1, %A1 - %Y2 = fmul ppc_fp128 %X2, %A2 - %Z1 = fadd ppc_fp128 %Y1, %B1 - %Z2 = fadd ppc_fp128 %Y2, %B2 - %R = fmul ppc_fp128 %Z1, %Z2 - ret ppc_fp128 %R -} - diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll deleted file mode 100644 index 12f97ab77ba..00000000000 --- a/test/Transforms/BBVectorize/simple.ll +++ /dev/null @@ -1,209 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" - -; Basic depth-3 chain -define double @test1(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain (last pair permuted) -define double @test2(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0 -; CHECK-NEXT: [[Z1_V_I1_2:%.*]] = insertelement <2 x double> [[Z1_V_I1_1]], double [[B1]], i32 1 -; CHECK-NEXT: [[Z2:%.*]] = fadd <2 x double> [[Y1]], [[Z1_V_I1_2]] -; CHECK-NEXT: [[Z2_V_R1:%.*]] = extractelement <2 x double> [[Z2]], i32 0 -; CHECK-NEXT: [[Z2_V_R2:%.*]] = extractelement <2 x double> [[Z2]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z2_V_R2]], [[Z2_V_R1]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y2, %B1 - %Z2 = fadd double %Y1, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain (last pair first splat) -define double @test3(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1_V_I0:%.*]] = shufflevector <2 x double> [[Y1]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Z1_V_I0]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y2, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain (last pair second splat) -define double @test4(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1_V_I0:%.*]] = shufflevector <2 x double> [[Y1]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Z1_V_I0]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y1, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain -define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) { -; CHECK-LABEL: @test5( -; CHECK-NEXT: [[X1_V_I1:%.*]] = shufflevector <2 x float> [[B1:%.*]], <2 x float> [[B2:%.*]], <4 x i32> -; CHECK-NEXT: [[X1_V_I0:%.*]] = shufflevector <2 x float> [[A1:%.*]], <2 x float> [[A2:%.*]], <4 x i32> -; CHECK-NEXT: [[X1:%.*]] = fsub <4 x float> [[X1_V_I0]], [[X1_V_I1]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <4 x float> [[X1]], [[X1_V_I0]] -; CHECK-NEXT: [[Z1:%.*]] = fadd <4 x float> [[Y1]], [[X1_V_I1]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = shufflevector <4 x float> [[Z1]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[Z1_V_R2:%.*]] = shufflevector <4 x float> [[Z1]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[R:%.*]] = fmul <2 x float> [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret <2 x float> [[R]] -; - %X1 = fsub <2 x float> %A1, %B1 - %X2 = fsub <2 x float> %A2, %B2 - %Y1 = fmul <2 x float> %X1, %A1 - %Y2 = fmul <2 x float> %X2, %A2 - %Z1 = fadd <2 x float> %Y1, %B1 - %Z2 = fadd <2 x float> %Y2, %B2 - %R = fmul <2 x float> %Z1, %Z2 - ret <2 x float> %R -} - -; Basic chain with shuffles -define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { -; CHECK-LABEL: @test6( -; CHECK-NEXT: [[X1_V_I1:%.*]] = shufflevector <8 x i8> [[B1:%.*]], <8 x i8> [[B2:%.*]], <16 x i32> -; CHECK-NEXT: [[X1_V_I0:%.*]] = shufflevector <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]], <16 x i32> -; CHECK-NEXT: [[X1:%.*]] = sub <16 x i8> [[X1_V_I0]], [[X1_V_I1]] -; CHECK-NEXT: [[Y1:%.*]] = mul <16 x i8> [[X1]], [[X1_V_I0]] -; CHECK-NEXT: [[Z1:%.*]] = add <16 x i8> [[Y1]], [[X1_V_I1]] -; CHECK-NEXT: [[Q1_V_I1:%.*]] = shufflevector <16 x i8> [[Z1]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: [[Q1:%.*]] = shufflevector <16 x i8> [[Z1]], <16 x i8> [[Q1_V_I1]], <16 x i32> -; CHECK-NEXT: [[Q1_V_R1:%.*]] = shufflevector <16 x i8> [[Q1]], <16 x i8> undef, <8 x i32> -; CHECK-NEXT: [[Q1_V_R2:%.*]] = shufflevector <16 x i8> [[Q1]], <16 x i8> undef, <8 x i32> -; CHECK-NEXT: [[R:%.*]] = mul <8 x i8> [[Q1_V_R1]], [[Q1_V_R2]] -; CHECK-NEXT: ret <8 x i8> [[R]] -; - %X1 = sub <8 x i8> %A1, %B1 - %X2 = sub <8 x i8> %A2, %B2 - %Y1 = mul <8 x i8> %X1, %A1 - %Y2 = mul <8 x i8> %X2, %A2 - %Z1 = add <8 x i8> %Y1, %B1 - %Z2 = add <8 x i8> %Y2, %B2 - %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> - %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> - %R = mul <8 x i8> %Q1, %Q2 - ret <8 x i8> %R -} - -; Basic depth-3 chain (flipped order) -define double @test7(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test7( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z2 = fadd double %Y2, %B2 - %Z1 = fadd double %Y1, %B1 - %R = fmul double %Z1, %Z2 - ret double %R -} - -; Basic depth-3 chain (subclass data) -define i64 @test8(i64 %A1, i64 %A2, i64 %B1, i64 %B2) { -; CHECK-LABEL: @test8( -; CHECK-NEXT: [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Y1:%.*]] = mul <2 x i64> [[X1]], [[X1_V_I0_2]] -; CHECK-NEXT: [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]] -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1 -; CHECK-NEXT: [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: ret i64 [[R]] -; - %X1 = sub nsw i64 %A1, %B1 - %X2 = sub i64 %A2, %B2 - %Y1 = mul i64 %X1, %A1 - %Y2 = mul i64 %X2, %A2 - %Z1 = add i64 %Y1, %B1 - %Z2 = add i64 %Y2, %B2 - %R = mul i64 %Z1, %Z2 - ret i64 %R -} - diff --git a/test/Transforms/BBVectorize/simple3.ll b/test/Transforms/BBVectorize/simple3.ll deleted file mode 100644 index 7dd538bdfb0..00000000000 --- a/test/Transforms/BBVectorize/simple3.ll +++ /dev/null @@ -1,38 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=192 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" - -; Basic depth-3 chain -define double @test1(double %A1, double %A2, double %A3, double %B1, double %B2, double %B3) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: [[X1_V_I1_11:%.*]] = insertelement <3 x double> undef, double [[B1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I1_22:%.*]] = insertelement <3 x double> [[X1_V_I1_11]], double [[B2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I1:%.*]] = insertelement <3 x double> [[X1_V_I1_22]], double [[B3:%.*]], i32 2 -; CHECK-NEXT: [[X1_V_I0_13:%.*]] = insertelement <3 x double> undef, double [[A1:%.*]], i32 0 -; CHECK-NEXT: [[X1_V_I0_24:%.*]] = insertelement <3 x double> [[X1_V_I0_13]], double [[A2:%.*]], i32 1 -; CHECK-NEXT: [[X1_V_I0:%.*]] = insertelement <3 x double> [[X1_V_I0_24]], double [[A3:%.*]], i32 2 -; CHECK-NEXT: [[X1:%.*]] = fsub <3 x double> [[X1_V_I0]], [[X1_V_I1]] -; CHECK-NEXT: [[Y1:%.*]] = fmul <3 x double> [[X1]], [[X1_V_I0]] -; CHECK-NEXT: [[Z1:%.*]] = fadd <3 x double> [[Y1]], [[X1_V_I1]] -; CHECK-NEXT: [[Z1_V_R210:%.*]] = extractelement <3 x double> [[Z1]], i32 2 -; CHECK-NEXT: [[Z1_V_R1:%.*]] = extractelement <3 x double> [[Z1]], i32 0 -; CHECK-NEXT: [[Z1_V_R2:%.*]] = extractelement <3 x double> [[Z1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]] -; CHECK-NEXT: [[R:%.*]] = fmul double [[R1]], [[Z1_V_R210]] -; CHECK-NEXT: ret double [[R]] -; - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %X3 = fsub double %A3, %B3 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Y3 = fmul double %X3, %A3 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %Z3 = fadd double %Y3, %B3 - %R1 = fmul double %Z1, %Z2 - %R = fmul double %R1, %Z3 - ret double %R -} - diff --git a/test/Transforms/BBVectorize/vector-sel.ll b/test/Transforms/BBVectorize/vector-sel.ll deleted file mode 100644 index bc15073b5a1..00000000000 --- a/test/Transforms/BBVectorize/vector-sel.ll +++ /dev/null @@ -1,43 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -bb-vectorize -S | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@d = external global [1 x [10 x [1 x i16]]], align 16 - -define void @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BOOL:%.*]] = icmp ne i32 undef, 0 -; CHECK-NEXT: [[BOOLVEC:%.*]] = icmp ne <4 x i32> undef, zeroinitializer -; CHECK-NEXT: br label [[BODY:%.*]] -; CHECK: body: -; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[BOOL]], <4 x i16> , <4 x i16> -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL]], <4 x i16> , <4 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[BOOLVEC]], <4 x i1> [[BOOLVEC]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> , <8 x i16> [[TMP2]] -; CHECK-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr ([1 x [10 x [1 x i16]]], [1 x [10 x [1 x i16]]]* @d, i64 0, i64 0, i64 undef, i64 0) to <8 x i16>*), align 2 -; CHECK-NEXT: ret void -; -entry: - %bool = icmp ne i32 undef, 0 - %boolvec = icmp ne <4 x i32> undef, zeroinitializer - br label %body - -body: - %0 = select i1 %bool, <4 x i16> , <4 x i16> - %1 = select i1 %bool, <4 x i16> , <4 x i16> - %2 = select <4 x i1> %boolvec, <4 x i16> , <4 x i16> %0 - %3 = select <4 x i1> %boolvec, <4 x i16> , <4 x i16> %1 - %4 = add nsw <4 x i16> %2, zeroinitializer - %5 = add nsw <4 x i16> %3, zeroinitializer - %6 = getelementptr inbounds [1 x [10 x [1 x i16]]], [1 x [10 x [1 x i16]]]* @d, i64 0, i64 0, i64 undef, i64 0 - %7 = bitcast i16* %6 to <4 x i16>* - store <4 x i16> %4, <4 x i16>* %7, align 2 - %8 = getelementptr [1 x [10 x [1 x i16]]], [1 x [10 x [1 x i16]]]* @d, i64 0, i64 0, i64 undef, i64 4 - %9 = bitcast i16* %8 to <4 x i16>* - store <4 x i16> %5, <4 x i16>* %9, align 2 - ret void -} diff --git a/test/Transforms/BBVectorize/xcore/no-vector-registers.ll b/test/Transforms/BBVectorize/xcore/no-vector-registers.ll deleted file mode 100644 index 9ebdb7368a3..00000000000 --- a/test/Transforms/BBVectorize/xcore/no-vector-registers.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S -mtriple=xcore | FileCheck %s - -target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32" -target triple = "xcore" - -; Basic depth-3 chain -define double @test1(double %A1, double %A2, double %B1, double %B2) { -; CHECK-LABEL: @test1( -; CHECK-NOT: <2 x double> - %X1 = fsub double %A1, %B1 - %X2 = fsub double %A2, %B2 - %Y1 = fmul double %X1, %A1 - %Y2 = fmul double %X2, %A2 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 - %R = fmul double %Z1, %Z2 - ret double %R -}