
Revert r318193 "[SLPVectorizer] Failure to beneficially vectorize 'copyable' elements in integer binary ops."

It crashes building sqlite; see reply on the llvm-commits thread.

> [SLPVectorizer] Failure to beneficially vectorize 'copyable' elements in integer binary ops.
>
>         Patch tries to improve vectorization of the following code:
>
>         void add1(int * __restrict dst, const int * __restrict src) {
>           *dst++ = *src++;
>           *dst++ = *src++ + 1;
>           *dst++ = *src++ + 2;
>           *dst++ = *src++ + 3;
>         }
>         Allows to vectorize even if the very first operation is not a binary add, but just a load.
>
>         Fixed issues related to previous commit.
>
>         Reviewers: spatel, mzolotukhin, mkuper, hfinkel, RKSimon, filcab, ABataev
>
>         Reviewed By: ABataev, RKSimon
>
>         Subscribers: llvm-commits, RKSimon
>
>         Differential Revision: https://reviews.llvm.org/D28907

llvm-svn: 318239
Hans Wennborg 2017-11-15 00:38:13 +00:00
parent 1bb0b7dfdf
commit 4937b695da
3 changed files with 272 additions and 438 deletions
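For context: the reverted patch treated a lane that is a plain copy as if it were "x + 0" (a "copyable" element), so a bundle mixing loads and adds, like the one in add1 above, could still be vectorized. Roughly, the IR it aimed to produce for add1 looked like the following (a sketch reconstructed from the test expectations removed below, not necessarily the exact output):

    define void @add1(i32* noalias %dst, i32* noalias %src) {
    entry:
      %0 = bitcast i32* %src to <4 x i32>*
      %1 = load <4 x i32>, <4 x i32>* %0, align 4
      %2 = add nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %1
      %3 = bitcast i32* %dst to <4 x i32>*
      store <4 x i32> %2, <4 x i32>* %3, align 4
      ret void
    }

The revert restores the previous behavior, in which the plain copy lane keeps such a bundle from being vectorized.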


@@ -333,7 +333,7 @@ static unsigned getAltOpcode(unsigned Op) {
   case Instruction::Sub:
     return Instruction::Add;
   default:
-    return Op;
+    return 0;
   }
 }
@@ -346,20 +346,6 @@ static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
   return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
 }
 
-/// Checks if the \p Opcode can be considered as an operand of a (possibly)
-/// binary operation \p I.
-/// \returns The code of the binary operation of instruction \p I if the
-/// instruction with \p Opcode can be considered as an operand of \p I with the
-/// default value.
-static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
-  assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode())
-         && "Invalid Opcode");
-  if (Opcode != Instruction::PHI && isa<BinaryOperator>(I) &&
-      (I->getType()->isIntegerTy() || cast<FPMathOperator>(I)->isFast()))
-    return I->getOpcode();
-  return 0;
-}
-
 /// Chooses the correct key for scheduling data. If \p Op has the same (or
 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
 /// OpValue.
@@ -381,12 +367,7 @@ namespace {
 struct RawInstructionsData {
   /// Main Opcode of the instructions going to be vectorized.
   unsigned Opcode = 0;
-  /// Position of the first instruction with the \a Opcode.
-  unsigned OpcodePos = 0;
-  /// Need an additional analysis (if at least one of the instruction is not
-  /// same instruction kind as an instruction at OpcodePos position in the
-  /// list).
-  bool NeedAnalysis = false;
   /// The list of instructions have some instructions with alternate opcodes.
   bool HasAltOpcodes = false;
 };
@@ -401,38 +382,16 @@ static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
     return {};
   RawInstructionsData Res;
   unsigned Opcode = I0->getOpcode();
-  unsigned AltOpcode = getAltOpcode(Opcode);
-  unsigned NewOpcodePos = 0;
   // Walk through the list of the vectorized instructions
   // in order to check its structure described by RawInstructionsData.
   for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
     auto *I = dyn_cast<Instruction>(VL[Cnt]);
     if (!I)
       return {};
-    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
-      if (Opcode != I->getOpcode()) {
-        Res.HasAltOpcodes = true;
-        if (Res.NeedAnalysis && isOdd(NewOpcodePos))
-          std::swap(Opcode, AltOpcode);
-      }
-      continue;
-    }
-    if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) {
-      if (!Instruction::isBinaryOp(Opcode) ||
-          !Instruction::isCommutative(Opcode)) {
-        NewOpcodePos = Cnt;
-        Opcode = NewOpcode;
-        AltOpcode = getAltOpcode(Opcode);
-        Res.NeedAnalysis = true;
-      }
-    } else if (tryToRepresentAsInstArg(I->getOpcode(),
-                                       cast<Instruction>(VL[NewOpcodePos])))
-      Res.NeedAnalysis = true;
-    else
-      return {};
+    if (Opcode != I->getOpcode())
+      Res.HasAltOpcodes = true;
   }
   Res.Opcode = Opcode;
-  Res.OpcodePos = NewOpcodePos;
   return Res;
 }
@@ -462,20 +421,16 @@ struct InstructionsState {
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
   auto Res = getMainOpcode(VL);
   unsigned Opcode = Res.Opcode;
-  if (!Res.NeedAnalysis && !Res.HasAltOpcodes)
-    return InstructionsState(VL[Res.OpcodePos], Opcode, false);
-  auto *OpInst = cast<Instruction>(VL[Res.OpcodePos]);
+  if (!Res.HasAltOpcodes)
+    return InstructionsState(VL[0], Opcode, false);
+  auto *OpInst = cast<Instruction>(VL[0]);
   unsigned AltOpcode = getAltOpcode(Opcode);
   // Examine each element in the list instructions VL to determine
   // if some operations there could be considered as an alternative
-  // (for example as subtraction relates to addition operation) or
-  // operation could be an operand of a (possibly) binary operation.
+  // (for example as subtraction relates to addition operation).
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = cast<Instruction>(VL[Cnt]);
     unsigned InstOpcode = I->getOpcode();
-    if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
-      if (tryToRepresentAsInstArg(InstOpcode, OpInst))
-        InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode : Opcode;
     if ((Res.HasAltOpcodes &&
          InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
         (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
@@ -628,7 +583,6 @@ public:
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
-    ExtraScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
     NumLoadsWantToKeepOrder = 0;
@@ -768,40 +722,22 @@ private:
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
-
-    /// Info about instruction in this tree entry.
-    InstructionsState State;
   };
 
   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
-                          int &UserTreeIdx, const InstructionsState &S) {
-    assert((!Vectorized || S.Opcode != 0) &&
-           "Vectorized TreeEntry without opcode");
+                          int &UserTreeIdx) {
     VectorizableTree.emplace_back(VectorizableTree);
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
     if (Vectorized) {
-      Last->State = S;
-      unsigned AltOpcode = getAltOpcode(S.Opcode);
       for (int i = 0, e = VL.size(); i != e; ++i) {
-        unsigned RealOpcode =
-            (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode;
-        Value *Key = (cast<Instruction>(VL[i])->getOpcode() == RealOpcode)
-                         ? VL[i]
-                         : S.OpValue;
-        assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!");
-        if (VL[i] == Key)
-          ScalarToTreeEntry[Key] = idx;
-        else
-          ExtraScalarToTreeEntry[VL[i]][Key] = idx;
+        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
+        ScalarToTreeEntry[VL[i]] = idx;
       }
     } else {
-      Last->State.Opcode = 0;
-      Last->State.OpValue = VL[0];
-      Last->State.IsAltShuffle = false;
       MustGather.insert(VL.begin(), VL.end());
     }
@@ -829,24 +765,8 @@ private:
     return nullptr;
   }
 
-  TreeEntry *getTreeEntry(Value *V, Value *OpValue) {
-    if (V == OpValue)
-      return getTreeEntry(V);
-    auto I = ExtraScalarToTreeEntry.find(V);
-    if (I != ExtraScalarToTreeEntry.end()) {
-      auto &STT = I->second;
-      auto STTI = STT.find(OpValue);
-      if (STTI != STT.end())
-        return &VectorizableTree[STTI->second];
-    }
-    return nullptr;
-  }
-
   /// Maps a specific scalar to its tree entry.
-  SmallDenseMap<Value *, int> ScalarToTreeEntry;
-
-  /// Maps a specific scalar to its tree entry(s) with leading scalar.
-  SmallDenseMap<Value *, SmallDenseMap<Value *, int>> ExtraScalarToTreeEntry;
+  SmallDenseMap<Value*, int> ScalarToTreeEntry;
 
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
@@ -1418,15 +1338,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
       continue;
 
     // For each lane:
-    const unsigned Opcode = Entry->State.Opcode;
-    const unsigned AltOpcode = getAltOpcode(Opcode);
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
-      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
-                           cast<Instruction>(Scalar)->getOpcode()))
-        continue;
       // Check if the scalar is externally used as an extra arg.
       auto ExtI = ExternallyUsedValues.find(Scalar);
       if (ExtI != ExternallyUsedValues.end()) {
@@ -1469,37 +1383,6 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   }
 }
 
-static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
-  switch(Opcode) {
-  case Instruction::Add:
-  case Instruction::Sub:
-  case Instruction::Or:
-  case Instruction::Xor:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-    return ConstantInt::getNullValue(Ty);
-  case Instruction::Mul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-    return ConstantInt::get(Ty, /*V=*/1);
-  case Instruction::FAdd:
-  case Instruction::FSub:
-    return ConstantFP::get(Ty, /*V=*/0.0);
-  case Instruction::FMul:
-  case Instruction::FDiv:
-  case Instruction::FRem:
-    return ConstantFP::get(Ty, /*V=*/1.0);
-  case Instruction::And:
-    return ConstantInt::getAllOnesValue(Ty);
-  default:
-    break;
-  }
-  llvm_unreachable("unknown binop for default constant value");
-}
-
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             int UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -1507,46 +1390,31 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   InstructionsState S = getSameOpcode(VL);
   if (Depth == RecursionMaxDepth) {
     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
   // Don't handle vectors.
   if (S.OpValue->getType()->isVectorTy()) {
     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
 
   // If all of the operands are identical or constant we have a simple solution.
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
-  // Avoid any vectors that are wider than two elements and
-  // with real operations less than or equal to half of vector
-  // to others members are operands to that operations.
-  unsigned AltOpcode = getAltOpcode(S.Opcode);
-  unsigned SameOrAlt = 0;
-  if (VL.size() > 2) {
-    for (Value *V : VL) {
-      auto *Instr = cast<Instruction>(V);
-      if (sameOpcodeOrAlt(S.Opcode, AltOpcode, Instr->getOpcode()))
-        SameOrAlt++;
-    }
-    if (SameOrAlt <= (VL.size() / 2))
-      return;
-  }
-
   // We now know that this is a vector of instructions of the same type from
   // the same block.
@@ -1555,7 +1423,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (EphValues.count(VL[i])) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is ephemeral.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1566,7 +1434,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
       if (E->Scalars[i] != VL[i]) {
         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         return;
       }
     }
@@ -1585,7 +1453,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (getTreeEntry(I)) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is already in tree.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1595,7 +1463,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
     if (MustGather.count(VL[i])) {
       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1609,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     // Don't go into unreachable blocks. They may contain instructions with
     // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
@@ -1618,7 +1486,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     for (unsigned j = i + 1; j < e; ++j)
       if (VL[i] == VL[j]) {
         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         return;
       }
@@ -1633,7 +1501,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
            "tryScheduleBundle should cancelScheduling on failure");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1652,12 +1520,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           if (Term) {
            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
            BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, false, UserTreeIdx, S);
+            newTreeEntry(VL, false, UserTreeIdx);
            return;
          }
        }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1679,7 +1547,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       } else {
         BS.cancelScheduling(VL, VL0);
       }
-      newTreeEntry(VL, Reuse, UserTreeIdx, S);
+      newTreeEntry(VL, Reuse, UserTreeIdx);
       return;
     }
     case Instruction::Load: {
@@ -1694,7 +1562,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (DL->getTypeSizeInBits(ScalarTy) !=
           DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
         return;
       }
@@ -1705,7 +1573,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         LoadInst *L = cast<LoadInst>(VL[i]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
@@ -1727,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
       if (Consecutive) {
         ++NumLoadsWantToKeepOrder;
-        newTreeEntry(VL, true, UserTreeIdx, S);
+        newTreeEntry(VL, true, UserTreeIdx);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
       }
@@ -1742,7 +1610,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       }
 
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
 
       if (ReverseConsecutive) {
         ++NumLoadsWantToChangeOrder;
@@ -1769,12 +1637,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
           return;
         }
       }
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1797,13 +1665,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1835,7 +1703,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
       // Sort operands of the instructions so that each side is more likely to
@@ -1851,21 +1719,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *VecOp : VL) {
-          auto *I = cast<Instruction>(VecOp);
-          if (I->getOpcode() == S.Opcode) {
-            Operands.push_back(I->getOperand(i));
-            continue;
-          }
-          assert(Instruction::isBinaryOp(S.Opcode) &&
-                 "Expected a binary operation.");
-          Value *Operand = isOdd(i)
-                               ? getDefaultConstantForOpcode(S.Opcode, I->getType())
-                               : VecOp;
-          Operands.push_back(Operand);
-        }
-        if (allSameType(Operands))
-          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        for (Value *j : VL)
+          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
@@ -1875,7 +1732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
@@ -1888,7 +1745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         if (Ty0 != CurTy) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
@@ -1900,12 +1757,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           DEBUG(
               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
@@ -1922,12 +1779,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
       ValueList Operands;
@@ -1945,7 +1802,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (!isTriviallyVectorizable(ID)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
@@ -1959,7 +1816,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                        << "\n");
           return;
@@ -1970,7 +1827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           Value *A1J = CI2->getArgOperand(1);
           if (A1I != A1J) {
             BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, false, UserTreeIdx, S);
+            newTreeEntry(VL, false, UserTreeIdx);
             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                          << " argument "<< A1I<<"!=" << A1J
                          << "\n");
@@ -1983,14 +1840,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
                        << *VL[i] << '\n');
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
@@ -2007,11 +1864,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // then do not vectorize this instruction.
       if (!S.IsAltShuffle) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
         return;
       }
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
@@ -2026,19 +1883,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *VecOp : VL) {
-          auto *I = cast<Instruction>(VecOp);
-          if (sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) {
-            Operands.push_back(I->getOperand(i));
-            continue;
-          }
-          assert(Instruction::isBinaryOp(S.Opcode) &&
-                 "Expected a binary operation.");
-          Value *Operand = isOdd(i)
-                               ? getDefaultConstantForOpcode(S.Opcode, I->getType())
-                               : VecOp;
-          Operands.push_back(Operand);
-        }
+        for (Value *j : VL)
+          Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
@@ -2046,7 +1892,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     default:
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
   }
@@ -2167,17 +2013,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     }
     return getGatherCost(E->Scalars);
   }
-  assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
-  auto *VL0 = cast<Instruction>(E->State.OpValue);
-  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
-               (unsigned) Instruction::ShuffleVector : E->State.Opcode;
+  InstructionsState S = getSameOpcode(VL);
+  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+  Instruction *VL0 = cast<Instruction>(S.OpValue);
+  unsigned ShuffleOrOp = S.IsAltShuffle ?
+               (unsigned) Instruction::ShuffleVector : S.Opcode;
   switch (ShuffleOrOp) {
     case Instruction::PHI:
       return 0;
 
     case Instruction::ExtractValue:
     case Instruction::ExtractElement:
-      if (canReuseExtract(VL, E->State.OpValue)) {
+      if (canReuseExtract(VL, S.OpValue)) {
         int DeadCost = 0;
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
           Instruction *E = cast<Instruction>(VL[i]);
@@ -2221,8 +2068,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       // Calculate the cost of this instruction.
       VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
       int ScalarCost = VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(ShuffleOrOp, ScalarTy, Builder.getInt1Ty(), VL0);
-      int VecCost = TTI->getCmpSelInstrCost(ShuffleOrOp, VecTy, MaskTy, VL0);
+          TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+      int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
       return VecCost - ScalarCost;
     }
     case Instruction::Add:
@@ -2248,7 +2095,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
       TargetTransformInfo::OperandValueKind Op2VK =
-          TargetTransformInfo::OK_AnyValue;
+          TargetTransformInfo::OK_UniformConstantValue;
       TargetTransformInfo::OperandValueProperties Op1VP =
          TargetTransformInfo::OP_None;
       TargetTransformInfo::OperandValueProperties Op2VP =
@@ -2259,33 +2106,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       // If instead not all operands are constants, then set the operand kind
       // to OK_AnyValue. If all operands are constants but not the same,
       // then set the operand kind to OK_NonUniformConstantValue.
-      if (auto *CInt = dyn_cast<ConstantInt>(VL0->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
-        const unsigned Opcode = E->State.Opcode;
-        for (auto *V : VL) {
-          auto *I = cast<Instruction>(V);
-          if (I == VL0 || Opcode != I->getOpcode())
-            continue;
-          if (!isa<ConstantInt>(I->getOperand(1))) {
-            Op2VK = TargetTransformInfo::OK_AnyValue;
-            break;
-          }
-          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-              CInt != cast<ConstantInt>(I->getOperand(1)))
-            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-        }
-        // FIXME: Currently cost of model modification for division by power of
-        // 2 is handled for X86 and AArch64. Add support for other targets.
-        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-            CInt->getValue().isPowerOf2())
-          Op2VP = TargetTransformInfo::OP_PowerOf2;
-      }
-
-      int ScalarCost = VecTy->getNumElements() *
-          TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy,
-                                      Op1VK, Op2VK, Op1VP, Op2VP);
-      int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK,
-                                                Op2VK, Op1VP, Op2VP);
+      ConstantInt *CInt = nullptr;
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        const Instruction *I = cast<Instruction>(VL[i]);
+        if (!isa<ConstantInt>(I->getOperand(1))) {
+          Op2VK = TargetTransformInfo::OK_AnyValue;
+          break;
+        }
+        if (i == 0) {
+          CInt = cast<ConstantInt>(I->getOperand(1));
+          continue;
+        }
+        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+            CInt != cast<ConstantInt>(I->getOperand(1)))
+          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      }
+      // FIXME: Currently cost of model modification for division by power of
+      // 2 is handled for X86 and AArch64. Add support for other targets.
+      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+          CInt->getValue().isPowerOf2())
+        Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+      SmallVector<const Value *, 4> Operands(VL0->operand_values());
+      int ScalarCost =
+          VecTy->getNumElements() *
+          TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+                                      Op2VP, Operands);
+      int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
+                                                Op1VP, Op2VP, Operands);
       return VecCost - ScalarCost;
     }
     case Instruction::GetElementPtr: {
@@ -2351,18 +2199,23 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
           TargetTransformInfo::OK_AnyValue;
       TargetTransformInfo::OperandValueKind Op2VK =
           TargetTransformInfo::OK_AnyValue;
-      unsigned AltOpcode = getAltOpcode(E->State.Opcode);
-      int ScalarCost =
-          TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, Op1VK, Op2VK) *
-          VL.size() / 2;
-      ScalarCost +=
-          TTI->getArithmeticInstrCost(AltOpcode, ScalarTy, Op1VK, Op2VK) *
-          VL.size() / 2;
+      int ScalarCost = 0;
+      int VecCost = 0;
+      for (Value *i : VL) {
+        Instruction *I = cast<Instruction>(i);
+        if (!I)
+          break;
+        ScalarCost +=
+            TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+      }
       // VecCost is equal to sum of the cost of creating 2 vectors
       // and the cost of creating shuffle.
-      int VecCost =
-          TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, Op2VK);
-      VecCost += TTI->getArithmeticInstrCost(AltOpcode, VecTy, Op1VK, Op2VK);
+      Instruction *I0 = cast<Instruction>(VL[0]);
+      VecCost =
+          TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+      Instruction *I1 = cast<Instruction>(VL[1]);
+      VecCost +=
+          TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
       VecCost +=
           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
       return VecCost - ScalarCost;
@@ -2428,7 +2281,7 @@ int BoUpSLP::getSpillCost() {
   Instruction *PrevInst = nullptr;
 
   for (const auto &N : VectorizableTree) {
-    Instruction *Inst = dyn_cast<Instruction>(N.State.OpValue);
+    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
     if (!Inst)
       continue;
@@ -2488,7 +2341,7 @@ int BoUpSLP::getTreeCost() {
   for (TreeEntry &TE : VectorizableTree) {
     int C = getEntryCost(&TE);
     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
-                 << *TE.State.OpValue << ".\n");
+                 << *TE.Scalars[0] << ".\n");
     Cost += C;
   }
@@ -2509,7 +2362,7 @@ int BoUpSLP::getTreeCost() {
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
     auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0].State.OpValue;
+    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
     if (MinBWs.count(ScalarRoot)) {
       auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
       auto Extend =
@@ -2572,15 +2425,13 @@ void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                         SmallVectorImpl<Value *> &Right) {
   // Push left and right operands of binary operation into Left and Right
   unsigned AltOpcode = getAltOpcode(Opcode);
+  (void)AltOpcode;
   for (Value *V : VL) {
     auto *I = cast<Instruction>(V);
-    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
-      Left.push_back(I->getOperand(0));
-      Right.push_back(I->getOperand(1));
-    } else {
-      Left.push_back(I);
-      Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
-    }
+    assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
+           "Incorrect instruction in vector");
+    Left.push_back(I->getOperand(0));
+    Right.push_back(I->getOperand(1));
   }
 
   // Reorder if we have a commutative operation and consecutive access
@@ -2629,13 +2480,8 @@ static bool shouldReorderOperands(
     int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
     ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
     bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
-  if (I.getOpcode() == Opcode) {
-    VLeft = I.getOperand(0);
-    VRight = I.getOperand(1);
-  } else {
-    VLeft = &I;
-    VRight = getDefaultConstantForOpcode(Opcode, I.getType());
-  }
+  VLeft = I.getOperand(0);
+  VRight = I.getOperand(1);
   // If we have "SplatRight", try to see if commuting is needed to preserve it.
   if (SplatRight) {
     if (VRight == Right[i - 1])
@@ -2699,15 +2545,8 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
   // Peel the first iteration out of the loop since there's nothing
   // interesting to do anyway and it simplifies the checks in the loop.
   auto *I = cast<Instruction>(VL[0]);
-  Value *VLeft;
-  Value *VRight;
-  if (I->getOpcode() == Opcode) {
-    VLeft = I->getOperand(0);
-    VRight = I->getOperand(1);
-  } else {
-    VLeft = I;
-    VRight = getDefaultConstantForOpcode(Opcode, I->getType());
-  }
+  Value *VLeft = I->getOperand(0);
+  Value *VRight = I->getOperand(1);
   if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
     // Favor having instruction to the right. FIXME: why?
     std::swap(VLeft, VRight);
@@ -2912,11 +2751,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
   if (E->VectorizedValue) {
-    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->State.OpValue << ".\n");
+    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
     return E->VectorizedValue;
   }
 
-  Instruction *VL0 = cast<Instruction>(E->State.OpValue);
+  InstructionsState S = getSameOpcode(E->Scalars);
+  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
   Type *ScalarTy = VL0->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
     ScalarTy = SI->getValueOperand()->getType();
@@ -2929,8 +2769,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     return V;
   }
 
-  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
-           (unsigned) Instruction::ShuffleVector : E->State.Opcode;
+  unsigned ShuffleOrOp = S.IsAltShuffle ?
+           (unsigned) Instruction::ShuffleVector : S.Opcode;
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -3040,7 +2880,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
       Value *V;
-      if (E->State.Opcode == Instruction::FCmp)
+      if (S.Opcode == Instruction::FCmp)
         V = Builder.CreateFCmp(P0, L, R);
       else
         V = Builder.CreateICmp(P0, L, R);
@@ -3092,19 +2932,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     case Instruction::Xor: {
       ValueList LHSVL, RHSVL;
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars, LHSVL,
+        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
                                        RHSVL);
       else
         for (Value *V : E->Scalars) {
           auto *I = cast<Instruction>(V);
-          if (I->getOpcode() == E->State.Opcode) {
-            LHSVL.push_back(I->getOperand(0));
-            RHSVL.push_back(I->getOperand(1));
-          } else {
-            LHSVL.push_back(V);
-            RHSVL.push_back(
-                getDefaultConstantForOpcode(E->State.Opcode, I->getType()));
-          }
+          LHSVL.push_back(I->getOperand(0));
+          RHSVL.push_back(I->getOperand(1));
         }
 
       setInsertPointAfterBundle(E->Scalars, VL0);
@@ -3116,7 +2950,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         return V;
 
       Value *V = Builder.CreateBinOp(
-          static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS, RHS);
+          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
       E->VectorizedValue = V;
       propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
       ++NumVectorInstructions;
@@ -3266,9 +3100,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     }
     case Instruction::ShuffleVector: {
       ValueList LHSVL, RHSVL;
-      assert(Instruction::isBinaryOp(E->State.Opcode) &&
+      assert(Instruction::isBinaryOp(S.Opcode) &&
              "Invalid Shuffle Vector Operand");
-      reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL, RHSVL);
+      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *LHS = vectorizeTree(LHSVL);
@@ -3279,9 +3113,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       // Create a vector of LHS op1 RHS
       Value *V0 = Builder.CreateBinOp(
-          static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS, RHS);
-      unsigned AltOpcode = getAltOpcode(E->State.Opcode);
+          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+      unsigned AltOpcode = getAltOpcode(S.Opcode);
       // Create a vector of LHS op2 RHS
       Value *V1 = Builder.CreateBinOp(
           static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
@@ -3303,13 +3137,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       }
 
       Value *ShuffleMask = ConstantVector::get(Mask);
-      InstructionsState S = getSameOpcode(EvenScalars);
-      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
-      propagateIRFlags(V0, EvenScalars, S.OpValue);
-
-      S = getSameOpcode(OddScalars);
-      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
-      propagateIRFlags(V1, OddScalars, S.OpValue);
+      propagateIRFlags(V0, EvenScalars);
+      propagateIRFlags(V1, OddScalars);
 
       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
       E->VectorizedValue = V;
@@ -3343,7 +3172,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
   // sign extend the extracted values below.
-  auto *ScalarRoot = VectorizableTree[0].State.OpValue;
+  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
   if (MinBWs.count(ScalarRoot)) {
     if (auto *I = dyn_cast<Instruction>(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
@@ -3454,15 +3283,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
     assert(Entry->VectorizedValue && "Can't find vectorizable value");
 
     // For each lane:
-    const unsigned Opcode = Entry->State.Opcode;
-    const unsigned AltOpcode = getAltOpcode(Opcode);
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
-      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
-                           cast<Instruction>(Scalar)->getOpcode()))
-        continue;
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
 #ifndef NDEBUG
@@ -3594,7 +3417,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   }
 
   for (Value *V : VL) {
-    ScheduleData *BundleMember = getScheduleData(V, isOneOf(OpValue, V));
+    ScheduleData *BundleMember = getScheduleData(V);
     assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
     if (BundleMember->IsScheduled) {
@@ -3667,7 +3490,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
   if (isa<PHINode>(OpValue))
     return;
 
-  ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle;
+  ScheduleData *Bundle = getScheduleData(OpValue);
   DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
@@ -3972,7 +3795,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
        I = I->getNextNode()) {
     BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
       assert(SD->isPartOfBundle() ==
-                 (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) &&
+                 (getTreeEntry(SD->Inst) != nullptr) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
       if (SD->isSchedulingEntity()) {
@@ -3995,13 +3818,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     ScheduleData *BundleMember = picked;
     while (BundleMember) {
       Instruction *pickedInst = BundleMember->Inst;
-      if (pickedInst == BundleMember->OpValue) {
-        if (LastScheduledInst->getNextNode() != pickedInst) {
-          BS->BB->getInstList().remove(pickedInst);
-          BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst);
-        }
-        LastScheduledInst = pickedInst;
+      if (LastScheduledInst->getNextNode() != pickedInst) {
+        BS->BB->getInstList().remove(pickedInst);
+        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+                                     pickedInst);
       }
+      LastScheduledInst = pickedInst;
       BundleMember = BundleMember->NextInBundle;
     }


@@ -1,52 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -S < %s | FileCheck %s
@bar = external global [4 x [4 x i32]], align 4
@dct_luma = external global [4 x [4 x i32]], align 4
define void @foo() local_unnamed_addr {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef
; CHECK-NEXT: store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]], <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
; CHECK-NEXT: unreachable
;
entry:
%add277 = add nsw i32 undef, undef
store i32 %add277, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
%0 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
%sub355 = add nsw i32 undef, %0
%shr.i = ashr i32 %sub355, 6
%arrayidx372 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
store i32 %shr.i, i32* %arrayidx372, align 4
%sub355.1 = add nsw i32 undef, %add277
%shr.i.1 = ashr i32 %sub355.1, 6
%arrayidx372.1 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
store i32 %shr.i.1, i32* %arrayidx372.1, align 4
%1 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
%sub355.2 = add nsw i32 undef, %1
%shr.i.2 = ashr i32 %sub355.2, 6
%arrayidx372.2 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
store i32 %shr.i.2, i32* %arrayidx372.2, align 4
%2 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
%sub355.3 = add nsw i32 undef, %2
%shr.i.3 = ashr i32 %sub355.3, 6
%arrayidx372.3 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
store i32 %shr.i.3, i32* %arrayidx372.3, align 4
unreachable
}


@@ -43,16 +43,22 @@ define void @add1(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @add1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -80,16 +80,22 @@ define void @sub0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @sub0(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> <i32 -1, i32 0, i32 -2, i32 -3>, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -193,18 +205,22 @@ define void @addsub0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @addsub0(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 0, i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32 0, i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -232,18 +248,22 @@ define void @addsub1(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @addsub1(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 0, i32 -3>
-; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 0, i32 -3>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -271,16 +291,22 @@ define void @mul(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @mul(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> <i32 257, i32 -3, i32 1, i32 -9>, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -308,16 +334,22 @@ define void @shl0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @shl0(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -421,16 +453,22 @@ define void @add1f(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @add1f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -458,16 +496,22 @@ define void @sub0f(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -571,18 +615,22 @@ define void @addsub0f(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @addsub0f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -610,18 +658,22 @@ define void @addsub1f(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @addsub1f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -649,16 +701,22 @@ define void @mulf(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -767,16 +825,22 @@ define void @sub0fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry: