1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 02:33:06 +01:00

[SystemZ] Return scalarized costs for vector instructions on older archs.

A cost query for a vector instruction should return a cost even without
target vector support, and not trigger an assert.

VectorCombine issues such queries when its input already contains vector operations written in the source code.

Review: Ulrich Weigand
This commit is contained in:
Jonas Paulsson 2020-02-20 10:40:30 -08:00
parent f6d46e3d59
commit 035c4568cc
2 changed files with 142 additions and 132 deletions

View File

@ -391,9 +391,57 @@ int SystemZTTIImpl::getArithmeticInstrCost(
}
}
if (Ty->isVectorTy()) {
assert(ST->hasVector() &&
"getArithmeticInstrCost() called with vector type.");
if (!Ty->isVectorTy()) {
// These FP operations are supported with a dedicated instruction for
// float, double and fp128 (base implementation assumes float generally
// costs 2).
if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
return 1;
// There is no native support for FRem.
if (Opcode == Instruction::FRem)
return LIBCALL_COST;
// Give discount for some combined logical operations if supported.
if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
if (Opcode == Instruction::Xor) {
for (const Value *A : Args) {
if (const Instruction *I = dyn_cast<Instruction>(A))
if (I->hasOneUse() &&
(I->getOpcode() == Instruction::And ||
I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::Xor))
return 0;
}
}
else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
for (const Value *A : Args) {
if (const Instruction *I = dyn_cast<Instruction>(A))
if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
return 0;
}
}
}
// Or requires one instruction, although it has custom handling for i64.
if (Opcode == Instruction::Or)
return 1;
if (Opcode == Instruction::Xor && ScalarBits == 1) {
if (ST->hasLoadStoreOnCond2())
return 5; // 2 * (li 0; loc 1); xor
return 7; // 2 * ipm sequences ; xor ; shift ; compare
}
if (DivRemConstPow2)
return (SignedDivRem ? SDivPow2Cost : 1);
if (DivRemConst)
return DivMulSeqCost;
if (SignedDivRem || UnsignedDivRem)
return DivInstrCost;
}
else if (ST->hasVector()) {
unsigned VF = Ty->getVectorNumElements();
unsigned NumVectors = getNumVectorRegs(Ty);
@ -454,56 +502,6 @@ int SystemZTTIImpl::getArithmeticInstrCost(
return Cost;
}
}
else { // Scalar:
// These FP operations are supported with a dedicated instruction for
// float, double and fp128 (base implementation assumes float generally
// costs 2).
if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
return 1;
// There is no native support for FRem.
if (Opcode == Instruction::FRem)
return LIBCALL_COST;
// Give discount for some combined logical operations if supported.
if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
if (Opcode == Instruction::Xor) {
for (const Value *A : Args) {
if (const Instruction *I = dyn_cast<Instruction>(A))
if (I->hasOneUse() &&
(I->getOpcode() == Instruction::And ||
I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::Xor))
return 0;
}
}
else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
for (const Value *A : Args) {
if (const Instruction *I = dyn_cast<Instruction>(A))
if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
return 0;
}
}
}
// Or requires one instruction, although it has custom handling for i64.
if (Opcode == Instruction::Or)
return 1;
if (Opcode == Instruction::Xor && ScalarBits == 1) {
if (ST->hasLoadStoreOnCond2())
return 5; // 2 * (li 0; loc 1); xor
return 7; // 2 * ipm sequences ; xor ; shift ; compare
}
if (DivRemConstPow2)
return (SignedDivRem ? SDivPow2Cost : 1);
if (DivRemConst)
return DivMulSeqCost;
if (SignedDivRem || UnsignedDivRem)
return DivInstrCost;
}
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
@ -513,35 +511,36 @@ int SystemZTTIImpl::getArithmeticInstrCost(
int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
assert (Tp->isVectorTy());
assert (ST->hasVector() && "getShuffleCost() called.");
unsigned NumVectors = getNumVectorRegs(Tp);
if (ST->hasVector()) {
unsigned NumVectors = getNumVectorRegs(Tp);
// TODO: Since fp32 is expanded, the shuffle cost should always be 0.
// TODO: Since fp32 is expanded, the shuffle cost should always be 0.
// FP128 values are always in scalar registers, so there is no work
// involved with a shuffle, except for broadcast. In that case register
// moves are done with a single instruction per element.
if (Tp->getScalarType()->isFP128Ty())
return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
// FP128 values are always in scalar registers, so there is no work
// involved with a shuffle, except for broadcast. In that case register
// moves are done with a single instruction per element.
if (Tp->getScalarType()->isFP128Ty())
return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
switch (Kind) {
case TargetTransformInfo::SK_ExtractSubvector:
// ExtractSubvector Index indicates start offset.
switch (Kind) {
case TargetTransformInfo::SK_ExtractSubvector:
// ExtractSubvector Index indicates start offset.
// Extracting a subvector from first index is a noop.
return (Index == 0 ? 0 : NumVectors);
// Extracting a subvector from first index is a noop.
return (Index == 0 ? 0 : NumVectors);
case TargetTransformInfo::SK_Broadcast:
// Loop vectorizer calls here to figure out the extra cost of
// broadcasting a loaded value to all elements of a vector. Since vlrep
// loads and replicates with a single instruction, adjust the returned
// value.
return NumVectors - 1;
case TargetTransformInfo::SK_Broadcast:
// Loop vectorizer calls here to figure out the extra cost of
// broadcasting a loaded value to all elements of a vector. Since vlrep
// loads and replicates with a single instruction, adjust the returned
// value.
return NumVectors - 1;
default:
default:
// SystemZ supports single instruction permutation / replication.
return NumVectors;
// SystemZ supports single instruction permutation / replication.
return NumVectors;
}
}
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@ -672,8 +671,36 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
unsigned DstScalarBits = Dst->getScalarSizeInBits();
unsigned SrcScalarBits = Src->getScalarSizeInBits();
if (Src->isVectorTy()) {
assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
if (!Src->isVectorTy()) {
assert (!Dst->isVectorTy());
if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
if (SrcScalarBits >= 32 ||
(I != nullptr && isa<LoadInst>(I->getOperand(0))))
return 1;
return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
}
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
Src->isIntegerTy(1)) {
if (ST->hasLoadStoreOnCond2())
return 2; // li 0; loc 1
// This should be extension of a compare i1 result, which is done with
// ipm and a varying sequence of instructions.
unsigned Cost = 0;
if (Opcode == Instruction::SExt)
Cost = (DstScalarBits < 64 ? 3 : 4);
if (Opcode == Instruction::ZExt)
Cost = 3;
Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
// If operands of an fp-type was compared, this costs +1.
Cost++;
return Cost;
}
}
else if (ST->hasVector()) {
assert (Dst->isVectorTy());
unsigned VF = Src->getVectorNumElements();
unsigned NumDstVectors = getNumVectorRegs(Dst);
@ -759,35 +786,6 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return VF + getScalarizationOverhead(Src, false, true);
}
}
else { // Scalar
assert (!Dst->isVectorTy());
if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
if (SrcScalarBits >= 32 ||
(I != nullptr && isa<LoadInst>(I->getOperand(0))))
return 1;
return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
}
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
Src->isIntegerTy(1)) {
if (ST->hasLoadStoreOnCond2())
return 2; // li 0; loc 1
// This should be extension of a compare i1 result, which is done with
// ipm and a varying sequence of instructions.
unsigned Cost = 0;
if (Opcode == Instruction::SExt)
Cost = (DstScalarBits < 64 ? 3 : 4);
if (Opcode == Instruction::ZExt)
Cost = 3;
Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
// If operands of an fp-type was compared, this costs +1.
Cost++;
return Cost;
}
}
return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
}
@ -806,8 +804,31 @@ static unsigned getOperandsExtensionCost(const Instruction *I) {
int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
Type *CondTy, const Instruction *I) {
if (ValTy->isVectorTy()) {
assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
if (!ValTy->isVectorTy()) {
switch (Opcode) {
case Instruction::ICmp: {
// A loaded value compared with 0 with multiple users becomes Load and
// Test. The load is then not foldable, so return 0 cost for the ICmp.
unsigned ScalarBits = ValTy->getScalarSizeInBits();
if (I != nullptr && ScalarBits >= 32)
if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
C->getZExtValue() == 0)
return 0;
unsigned Cost = 1;
if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
return Cost;
}
case Instruction::Select:
if (ValTy->isFloatingPointTy())
return 4; // No load on condition for FP - costs a conditional jump.
return 1; // Load On Condition / Select Register.
}
}
else if (ST->hasVector()) {
unsigned VF = ValTy->getVectorNumElements();
// Called with a compare instruction.
@ -856,30 +877,6 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
}
}
else { // Scalar
switch (Opcode) {
case Instruction::ICmp: {
// A loaded value compared with 0 with multiple users becomes Load and
// Test. The load is then not foldable, so return 0 cost for the ICmp.
unsigned ScalarBits = ValTy->getScalarSizeInBits();
if (I != nullptr && ScalarBits >= 32)
if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
C->getZExtValue() == 0)
return 0;
unsigned Cost = 1;
if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
return Cost;
}
case Instruction::Select:
if (ValTy->isFloatingPointTy())
return 4; // No load on condition for FP - costs a conditional jump.
return 1; // Load On Condition / Select Register.
}
}
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
}

View File

@ -0,0 +1,13 @@
; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z10
;
; Check that some costs can be returned for vector instructions also without
; vector support.
; Test body: one vector arithmetic op, one shuffle, one cast and one compare,
; all on undef operands, so only the instruction kinds and vector types matter.
; Presumably each instruction corresponds to one of the cost hooks changed in
; this commit (arithmetic, shuffle, cast, cmp/select) - confirm against the
; SystemZ TTI implementation. %arg is not used by the body.
define void @fun(<2 x double>* %arg) {
entry:
; Vector floating-point arithmetic on <2 x double>.
%add = fadd <2 x double> undef, undef
; Vector shuffle; the mask <1, 0> picks the two lanes of the first operand in
; reverse order. The result is intentionally left unnamed and unused.
shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
; Vector cast from <4 x float> to <4 x i32>.
%conv = fptoui <4 x float> undef to <4 x i32>
; Vector integer compare on <2 x i64>.
%cmp = icmp eq <2 x i64> undef, undef
ret void
}