
[AMDGPU][CostModel] Refine cost model for half- and quarter-rate instructions.

1. Throughput and codesize cost estimations were separated and updated.
2. Updated fdiv cost estimation for different cases.
3. Added scalarization processing for types that are treated as !isSimple() to
improve codesize estimation in getArithmeticInstrCost() and
getIntrinsicInstrCost(). The code was borrowed from the TCK_RecipThroughput
path of the base implementation.

The next step is to unify the scalarization part in the base class, which
currently works for the TCK_RecipThroughput path only.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D89973
dfukalov 2020-10-22 19:38:56 +03:00
parent 2c7b4a47cb
commit efaecfc60e
10 changed files with 417 additions and 270 deletions
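
For orientation before the diffs: the arithmetic behind the new f64 fdiv
estimates can be reproduced with a short standalone C++ sketch. The rate
helpers mirror the GCNTTIImpl.h hunk below; the CostKind enum, the
fdivF64Cost wrapper, and the driver are illustrative assumptions, and the
asserted values come from the updated fdiv.ll checks.

#include <cassert>
#include <iostream>

// Standalone sketch, not LLVM code: reproduces the patched cost arithmetic.
enum CostKind { RecipThroughput, CodeSize };

constexpr int TCC_Basic = 1; // TargetTransformInfo::TCC_Basic

int getFullRateInstrCost() { return TCC_Basic; }

// Half- and quarter-rate costs, as in the GCNTTIImpl.h hunk below.
int getHalfRateInstrCost(CostKind K = RecipThroughput) {
  return K == CodeSize ? 2 : 2 * TCC_Basic;
}

int getQuarterRateInstrCost(CostKind K = RecipThroughput) {
  return K == CodeSize ? 2 : 4 * TCC_Basic;
}

// fp64 ops are half rate on some parts and quarter rate on others.
int get64BitInstrCost(bool HasHalfRate64Ops, CostKind K = RecipThroughput) {
  return HasHalfRate64Ops ? getHalfRateInstrCost(K)
                          : getQuarterRateInstrCost(K);
}

// f64 fdiv per the patched getArithmeticInstrCost: seven 64-bit ops, one
// quarter-rate op and three half-rate ops, plus three full-rate ops when
// the DIV_SCALE condition-output workaround is needed (SI parts).
int fdivF64Cost(bool HasHalfRate64Ops, bool NeedsDivScaleWorkaround,
                CostKind K) {
  int Cost = 7 * get64BitInstrCost(HasHalfRate64Ops, K) +
             getQuarterRateInstrCost(K) + 3 * getHalfRateInstrCost(K);
  if (NeedsDivScaleWorkaround)
    Cost += 3 * getFullRateInstrCost();
  return Cost;
}

int main() {
  // Throughput, matching the updated fdiv.ll checks below.
  assert(fdivF64Cost(true, false, RecipThroughput) == 24);  // CIFASTF64
  assert(fdivF64Cost(false, false, RecipThroughput) == 38); // CISLOWF64
  assert(fdivF64Cost(true, true, RecipThroughput) == 27);   // SIFASTF64
  assert(fdivF64Cost(false, true, RecipThroughput) == 41);  // SISLOWF64
  // Code size is rate-independent, so fast and slow parts agree.
  assert(fdivF64Cost(false, false, CodeSize) == 22);        // SIZECI
  assert(fdivF64Cost(false, true, CodeSize) == 25);         // SIZESI
  std::cout << "fdiv f64 cost checks hold\n";
}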


@ -472,9 +472,50 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: We're having to query the throughput cost so that the basic
// implementation tries to generate legalize and scalarization costs. Maybe
// we could hoist the scalarization code here?
return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
Opd1Info, Opd2Info, Opd1PropInfo,
Opd2PropInfo, Args, CxtI);
if (CostKind != TTI::TCK_CodeSize)
return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
Opd1Info, Opd2Info, Opd1PropInfo,
Opd2PropInfo, Args, CxtI);
// Scalarization
// Check if any of the operands are vector operands.
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
bool IsFloat = Ty->isFPOrFPVectorTy();
// Assume that floating point arithmetic operations cost twice as much as
// integer operations.
unsigned OpCost = (IsFloat ? 2 : 1);
if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
// The operation is legal. Assume it costs 1.
// TODO: Once we have extract/insert subvector cost we need to use them.
return LT.first * OpCost;
}
if (!TLI->isOperationExpand(ISD, LT.second)) {
// If the operation is custom lowered, then assume that the code is twice
// as expensive.
return LT.first * 2 * OpCost;
}
// Else, assume that we need to scalarize this op.
// TODO: If one of the types get legalized by splitting, handle this
// similarly to what getCastInstrCost() does.
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
unsigned Cost = getArithmeticInstrCost(
Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
return getScalarizationOverhead(VTy, Args) + Num * Cost;
}
// We don't know anything about this scalar instruction.
return OpCost;
}
// Legalize the type.
@ -493,7 +534,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
case ISD::SRL:
case ISD::SRA:
if (SLT == MVT::i64)
return get64BitInstrCost() * LT.first * NElts;
return get64BitInstrCost(CostKind) * LT.first * NElts;
if (ST->has16BitInsts() && SLT == MVT::i16)
NElts = (NElts + 1) / 2;
@ -515,7 +556,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * NElts * getFullRateInstrCost();
case ISD::MUL: {
const int QuarterRateCost = getQuarterRateInstrCost();
const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
if (SLT == MVT::i64) {
const int FullRateCost = getFullRateInstrCost();
return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
@ -552,7 +593,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
case ISD::FADD:
case ISD::FSUB:
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost();
return LT.first * NElts * get64BitInstrCost(CostKind);
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
@ -565,7 +606,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: frem should be handled separately. The fdiv in it is most of it,
// but the current lowering is also not entirely correct.
if (SLT == MVT::f64) {
int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
int Cost = 7 * get64BitInstrCost(CostKind) +
getQuarterRateInstrCost(CostKind) +
3 * getHalfRateInstrCost(CostKind);
// Add cost of workaround.
if (!ST->hasUsableDivScaleConditionOutput())
Cost += 3 * getFullRateInstrCost();
@ -577,7 +620,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// TODO: This is more complicated, unsafe flags etc.
if ((SLT == MVT::f32 && !HasFP32Denormals) ||
(SLT == MVT::f16 && ST->has16BitInsts())) {
return LT.first * getQuarterRateInstrCost() * NElts;
return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
}
}
@ -587,12 +630,15 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// f32 fmul
// v_cvt_f16_f32
// f16 div_fixup
int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
int Cost =
4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
return LT.first * Cost * NElts;
}
if (SLT == MVT::f32 || SLT == MVT::f16) {
int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
// 4 more v_cvt_* insts without f16 insts support
int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
1 * getQuarterRateInstrCost(CostKind);
if (!HasFP32Denormals) {
// FP mode switches.
@ -642,7 +688,48 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Type *RetTy = ICA.getReturnType();
EVT OrigTy = TLI->getValueType(DL, RetTy);
if (!OrigTy.isSimple()) {
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
if (CostKind != TTI::TCK_CodeSize)
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
// TODO: Combine these two logic paths.
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
Type *RetTy = ICA.getReturnType();
unsigned VF = ICA.getVectorFactor();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
: 1);
assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
// Assume that we need to scalarize this intrinsic.
SmallVector<Type *, 4> Types;
for (const Value *Op : Args) {
Type *OpTy = Op->getType();
assert(VF == 1 || !OpTy->isVectorTy());
Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
}
if (VF > 1 && !RetTy->isVoidTy())
RetTy = FixedVectorType::get(RetTy, VF);
// Compute the scalarization overhead based on Args for a vector
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
if (RetVF > 1 || VF > 1) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
}
IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
ScalarizationCost, I);
return getIntrinsicInstrCost(Attrs, CostKind);
}
// Legalize the type.
@ -654,16 +741,16 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost();
return LT.first * NElts * get64BitInstrCost(CostKind);
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
// TODO: Get more refined intrinsic costs?
unsigned InstRate = getQuarterRateInstrCost();
unsigned InstRate = getQuarterRateInstrCost(CostKind);
if (ICA.getID() == Intrinsic::fma) {
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
: getQuarterRateInstrCost();
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
}
return LT.first * NElts * InstRate;
@ -714,7 +801,7 @@ int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getHalfRateInstrCost();
return LT.first * getHalfRateInstrCost(CostKind);
}
int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,


@ -115,21 +115,26 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
return TargetTransformInfo::TCC_Basic;
}
static inline int getHalfRateInstrCost() {
return 2 * TargetTransformInfo::TCC_Basic;
static inline int getHalfRateInstrCost(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
return CostKind == TTI::TCK_CodeSize ? 2
: 2 * TargetTransformInfo::TCC_Basic;
}
// TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
// should be 2 or 4.
static inline int getQuarterRateInstrCost() {
return 3 * TargetTransformInfo::TCC_Basic;
static inline int getQuarterRateInstrCost(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
return CostKind == TTI::TCK_CodeSize ? 2
: 4 * TargetTransformInfo::TCC_Basic;
}
// On some parts, normal fp64 operations are half rate, and others
// quarter. This also applies to some integer operations.
inline int get64BitInstrCost() const {
return ST->hasHalfRate64Ops() ?
getHalfRateInstrCost() : getQuarterRateInstrCost();
// On some parts, normal fp64 operations are half rate, and others
// quarter. This also applies to some integer operations.
inline int get64BitInstrCost(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const {
return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
}
public:
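
Taken together with the defaulted CostKind parameter, these helpers account
for most of the updated numbers in the fadd/fmul/fma tests below. A hedged
sketch of the two simplest combinations, reusing the helpers from the
standalone sample above (the wrapper functions themselves are illustrative,
not part of the patch):

// Per-element f64 fadd/fsub/fmul cost, per the FADD/FSUB and f64 FMUL cases
// in the getArithmeticInstrCost hunk above.
int f64ArithPerEltCost(bool HasHalfRate64Ops, CostKind K) {
  return get64BitInstrCost(HasHalfRate64Ops, K);
}
// fadd double: 2 fast, 4 slow, 2 at code size (FASTF64/SLOWF64/SIZEALL).

// Per-element fma rate, per the Intrinsic::fma case above.
int fmaPerEltCost(bool HasFastFMAF32, CostKind K) {
  return HasFastFMAF32 ? getHalfRateInstrCost(K) : getQuarterRateInstrCost(K);
}
// fma float: 2 fast, 4 slow, 2 at code size (FASTF32/SLOWF32/SIZEALL).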


@ -1,9 +1,9 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s
; ALL: 'fadd_f32'
; ALL-LABEL: 'fadd_f32'
; ALL: estimated cost of 1 for {{.*}} fadd float
define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
%vec = load float, float addrspace(1)* %vaddr
@ -12,7 +12,7 @@ define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)
ret void
}
; ALL: 'fadd_v2f32'
; ALL-LABEL: 'fadd_v2f32'
; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
@ -21,10 +21,8 @@ define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float
ret void
}
; ALL: 'fadd_v3f32'
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
; ALL-LABEL: 'fadd_v3f32'
; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fadd <3 x float> %vec, %b
@ -32,10 +30,8 @@ define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float
ret void
}
; ALL: 'fadd_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
; ALL-LABEL: 'fadd_v5f32'
; ALL: estimated cost of 5 for {{.*}} fadd <5 x float>
define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fadd <5 x float> %vec, %b
@ -43,9 +39,10 @@ define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float
ret void
}
; ALL: 'fadd_f64'
; ALL-LABEL: 'fadd_f64'
; FASTF64: estimated cost of 2 for {{.*}} fadd double
; SLOWF64: estimated cost of 3 for {{.*}} fadd double
; SLOWF64: estimated cost of 4 for {{.*}} fadd double
; SIZEALL: estimated cost of 2 for {{.*}} fadd double
define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fadd double %vec, %b
@ -53,9 +50,10 @@ define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(
ret void
}
; ALL: 'fadd_v2f64'
; ALL-LABEL: 'fadd_v2f64'
; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double>
; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%add = fadd <2 x double> %vec, %b
@ -63,9 +61,10 @@ define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
ret void
}
; ALL: 'fadd_v3f64'
; ALL-LABEL: 'fadd_v3f64'
; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double>
; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double>
define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%add = fadd <3 x double> %vec, %b
@ -73,7 +72,7 @@ define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
ret void
}
; ALL: 'fadd_f16'
; ALL-LABEL: 'fadd_f16'
; ALL: estimated cost of 1 for {{.*}} fadd half
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
%vec = load half, half addrspace(1)* %vaddr
@ -82,7 +81,7 @@ define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)*
ret void
}
; ALL: 'fadd_v2f16'
; ALL-LABEL: 'fadd_v2f16'
; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half>
; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half>
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
@ -92,7 +91,7 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
ret void
}
; ALL: 'fadd_v3f16'
; ALL-LABEL: 'fadd_v3f16'
; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half>
; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half>
define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
@ -102,7 +101,7 @@ define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
ret void
}
; ALL: 'fadd_v4f16'
; ALL-LABEL: 'fadd_v4f16'
; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half>
; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half>
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {


@ -1,19 +1,18 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,NOFP16,NOFP16-FP32DENORM,SLOWFP32DENORMS %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,THRPTALL,CIFASTF64,NOFP16 %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,THRPTALL,CISLOWF64,NOFP16 %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,THRPTALL,SIFASTF64,NOFP16 %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,THRPTALL,SISLOWF64,NOFP16 %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,THRPTALL,FP16,CISLOWF64 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s
; ALL: 'fdiv_f32_ieee'
; ALL: estimated cost of 10 for {{.*}} fdiv float
; ALL-LABEL: 'fdiv_f32_ieee'
; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
%vec = load float, float addrspace(1)* %vaddr
%add = fdiv float %vec, %b
@ -21,8 +20,9 @@ define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspa
ret void
}
; ALL: 'fdiv_f32_ftzdaz'
; ALL: estimated cost of 12 for {{.*}} fdiv float
; ALL-LABEL: 'fdiv_f32_ftzdaz'
; THRPTALL: estimated cost of 16 for {{.*}} fdiv float
; SIZEALL: estimated cost of 14 for {{.*}} fdiv float
define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 {
%vec = load float, float addrspace(1)* %vaddr
%add = fdiv float %vec, %b
@ -30,8 +30,9 @@ define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrs
ret void
}
; ALL: 'fdiv_v2f32_ieee'
; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
; ALL-LABEL: 'fdiv_v2f32_ieee'
; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%add = fdiv <2 x float> %vec, %b
@ -39,8 +40,9 @@ define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x
ret void
}
; ALL: 'fdiv_v2f32_ftzdaz'
; ALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
; ALL-LABEL: 'fdiv_v2f32_ftzdaz'
; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float>
; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%add = fdiv <2 x float> %vec, %b
@ -48,10 +50,9 @@ define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2
ret void
}
; ALL: 'fdiv_v3f32_ieee'
; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 36/30 when it is legal.
; ALL: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
; ALL-LABEL: 'fdiv_v3f32_ieee'
; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float>
define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fdiv <3 x float> %vec, %b
@ -59,10 +60,9 @@ define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x
ret void
}
; ALL: 'fdiv_v3f32_ftzdaz'
; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 36/30 when it is legal.
; ALL: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
; ALL-LABEL: 'fdiv_v3f32_ftzdaz'
; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float>
; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fdiv <3 x float> %vec, %b
@ -70,10 +70,9 @@ define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3
ret void
}
; ALL: 'fdiv_v5f32_ieee'
; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 60/50 when it is legal.
; ALL: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
; ALL-LABEL: 'fdiv_v5f32_ieee'
; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float>
define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fdiv <5 x float> %vec, %b
@ -81,10 +80,9 @@ define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x
ret void
}
; ALL: 'fdiv_v5f32_ftzdaz'
; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 60/50 when it is legal.
; ALL: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
; ALL-LABEL: 'fdiv_v5f32_ftzdaz'
; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float>
; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fdiv <5 x float> %vec, %b
@ -92,11 +90,13 @@ define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5
ret void
}
; ALL: 'fdiv_f64'
; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
; ALL-LABEL: 'fdiv_f64'
; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double
; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double
; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double
; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
; SIZECI: estimated cost of 22 for {{.*}} fdiv double
; SIZESI: estimated cost of 25 for {{.*}} fdiv double
define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fdiv double %vec, %b
@ -104,11 +104,13 @@ define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(
ret void
}
; ALL: 'fdiv_v2f64'
; CIFASTF64: estimated cost of 58 for {{.*}} fdiv <2 x double>
; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
; ALL-LABEL: 'fdiv_v2f64'
; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double>
; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double>
; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x double>
; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double>
; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double>
; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double>
define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%add = fdiv <2 x double> %vec, %b
@ -116,11 +118,13 @@ define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
ret void
}
; ALL: 'fdiv_v3f64'
; CIFASTF64: estimated cost of 87 for {{.*}} fdiv <3 x double>
; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
; ALL-LABEL: 'fdiv_v3f64'
; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double>
; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double>
; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double>
; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double>
; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double>
; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double>
define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%add = fdiv <3 x double> %vec, %b
@ -128,9 +132,11 @@ define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
ret void
}
; ALL: 'fdiv_f16_f32_ieee'
; NOFP16: estimated cost of 10 for {{.*}} fdiv half
; FP16: estimated cost of 10 for {{.*}} fdiv half
; ALL-LABEL: 'fdiv_f16_f32_ieee'
; NOFP16: estimated cost of 14 for {{.*}} fdiv half
; FP16: estimated cost of 12 for {{.*}} fdiv half
; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
%vec = load half, half addrspace(1)* %vaddr
%add = fdiv half %vec, %b
@ -138,9 +144,11 @@ define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrs
ret void
}
; ALL: 'fdiv_f16_f32_ftzdaz'
; NOFP16: estimated cost of 12 for {{.*}} fdiv half
; FP16: estimated cost of 10 for {{.*}} fdiv half
; ALL-LABEL: 'fdiv_f16_f32_ftzdaz'
; NOFP16: estimated cost of 16 for {{.*}} fdiv half
; FP16: estimated cost of 12 for {{.*}} fdiv half
; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half
; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 {
%vec = load half, half addrspace(1)* %vaddr
%add = fdiv half %vec, %b
@ -148,9 +156,11 @@ define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half add
ret void
}
; ALL: 'fdiv_v2f16_f32_ieee'
; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
; ALL-LABEL: 'fdiv_v2f16_f32_ieee'
; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%add = fdiv <2 x half> %vec, %b
@ -158,9 +168,11 @@ define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2
ret void
}
; ALL: 'fdiv_v2f16_f32_ftzdaz'
; NOFP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz'
; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half>
; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%add = fdiv <2 x half> %vec, %b
@ -168,9 +180,11 @@ define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out,
ret void
}
; ALL: 'fdiv_v4f16_f32_ieee'
; NOFP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
; ALL-LABEL: 'fdiv_v4f16_f32_ieee'
; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half>
; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half>
; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
%add = fdiv <4 x half> %vec, %b
@ -178,9 +192,11 @@ define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4
ret void
}
; ALL: 'fdiv_v4f16_f32_ftzdaz'
; NOFP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz'
; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half>
; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half>
; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 {
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
%add = fdiv <4 x half> %vec, %b
@ -188,9 +204,9 @@ define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out,
ret void
}
; ALL: 'rcp_f32_ieee'
; SLOWFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
; FASTFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
; ALL-LABEL: 'rcp_f32_ieee'
; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
%vec = load float, float addrspace(1)* %vaddr
%add = fdiv float 1.0, %vec
@ -198,8 +214,9 @@ define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspac
ret void
}
; ALL: 'rcp_f32_ftzdaz'
; ALL: estimated cost of 3 for {{.*}} fdiv float
; ALL-LABEL: 'rcp_f32_ftzdaz'
; THRPTALL: estimated cost of 4 for {{.*}} fdiv float
; SIZEALL: estimated cost of 2 for {{.*}} fdiv float
define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 {
%vec = load float, float addrspace(1)* %vaddr
%add = fdiv float 1.0, %vec
@ -207,9 +224,11 @@ define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrsp
ret void
}
; ALL: 'rcp_f16_f32_ieee'
; NOFP16: estimated cost of 10 for {{.*}} fdiv half
; FP16: estimated cost of 3 for {{.*}} fdiv half
; ALL-LABEL: 'rcp_f16_f32_ieee'
; NOFP16: estimated cost of 14 for {{.*}} fdiv half
; FP16: estimated cost of 4 for {{.*}} fdiv half
; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
; SIZEF16: estimated cost of 2 for {{.*}} fdiv half
define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
%vec = load half, half addrspace(1)* %vaddr
%add = fdiv half 1.0, %vec
@ -217,9 +236,9 @@ define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrsp
ret void
}
; ALL: 'rcp_f16_f32_ftzdaz'
; NOFP16: estimated cost of 3 for {{.*}} fdiv half
; FP16: estimated cost of 3 for {{.*}} fdiv half
; ALL-LABEL: 'rcp_f16_f32_ftzdaz'
; THRPTALL: estimated cost of 4 for {{.*}} fdiv half
; SIZEALL: estimated cost of 2 for {{.*}} fdiv half
define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 {
%vec = load half, half addrspace(1)* %vaddr
%add = fdiv half 1.0, %vec
@ -227,11 +246,13 @@ define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addr
ret void
}
; ALL: 'rcp_f64'
; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
; ALL-LABEL: 'rcp_f64'
; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double
; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double
; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double
; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
; SIZECI: estimated cost of 22 for {{.*}} fdiv double
; SIZESI: estimated cost of 25 for {{.*}} fdiv double
define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fdiv double 1.0, %vec
@ -239,9 +260,9 @@ define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1
ret void
}
; ALL: 'rcp_v2f32_ieee'
; SLOWFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
; FASTFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
; ALL-LABEL: 'rcp_v2f32_ieee'
; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
@ -249,8 +270,9 @@ define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x f
ret void
}
; ALL: 'rcp_v2f32_ftzdaz'
; ALL: estimated cost of 6 for {{.*}} fdiv <2 x float>
; ALL-LABEL: 'rcp_v2f32_ftzdaz'
; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float>
; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float>
define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
@ -258,9 +280,11 @@ define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x
ret void
}
; ALL: 'rcp_v2f16_f32_ieee'
; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
; ALL-LABEL: 'rcp_v2f16_f32_ieee'
; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half>
; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half>
define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
@ -268,9 +292,9 @@ define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2
ret void
}
; ALL: 'rcp_v2f16_f32_ftzdaz'
; NOFP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz'
; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half>
; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half>
define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
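
The scalar f32/f16 fdiv estimates checked above can be spelled out the same
way, again reusing the rate helpers from the first sample. The FP-mode-switch
term is an assumption inferred from the checked values (14 to 16 throughput,
12 to 14 code size); the hunk that adds it is elided above.

// f32 fdiv: 10 full-rate insts plus one quarter-rate inst, per the patched
// FDIV case; flushing f32 denormals (ftz/daz) is assumed to add two
// full-rate insts for FP mode switches.
int fdivF32Cost(bool HasFP32Denormals, CostKind K) {
  int Cost = 10 * getFullRateInstrCost() + getQuarterRateInstrCost(K);
  if (!HasFP32Denormals)
    Cost += 2 * getFullRateInstrCost();
  return Cost; // ieee: 14 throughput, 12 code size; ftzdaz: 16 and 14
}

// f16 fdiv with 16-bit insts: f32 rcp, f32 fmul, v_cvt_f16_f32 and f16
// div_fixup, per the comment in the getArithmeticInstrCost hunk.
int fdivF16Cost(CostKind K) {
  return 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(K);
} // 12 throughput, 8 code size, matching FP16 and SIZEF16 above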


@ -1,11 +1,12 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF32,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF32,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s
; ALL-LABEL: 'fma_f32'
; SLOW32: estimated cost of 3 for {{.*}} call float @llvm.fma.f32
; FAST32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32
; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
%vec = load float, float addrspace(1)* %vaddr
%fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1
@ -14,8 +15,9 @@ define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)*
}
; ALL-LABEL: 'fma_v2f32'
; SLOW32: estimated cost of 6 for {{.*}} call <2 x float> @llvm.fma.v2f32
; FAST32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32
; FASTF32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %vec, <2 x float> %vec, <2 x float> %vec) #1
@ -24,8 +26,9 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float>
}
; ALL-LABEL: 'fma_v3f32'
; SLOW32: estimated cost of 9 for {{.*}} call <3 x float> @llvm.fma.v3f32
; FAST32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32
; FASTF32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1
@ -34,8 +37,9 @@ define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float>
}
; ALL-LABEL: 'fma_v5f32'
; SLOW32: estimated cost of 15 for {{.*}} call <5 x float> @llvm.fma.v5f32
; FAST32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32
; FASTF32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1
@ -44,8 +48,9 @@ define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float>
}
; ALL-LABEL: 'fma_f64'
; SLOW64: estimated cost of 3 for {{.*}} call double @llvm.fma.f64
; FAST64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
; SLOWF64: estimated cost of 4 for {{.*}} call double @llvm.fma.f64
; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
%vec = load double, double addrspace(1)* %vaddr
%fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1
@ -54,8 +59,9 @@ define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1
}
; ALL-LABEL: 'fma_v2f64'
; SLOW64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.fma.v2f64
; FAST64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64
; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1
@ -64,8 +70,9 @@ define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x doubl
}
; ALL-LABEL: 'fma_v3f64'
; SLOW64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.fma.v3f64
; FAST64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64
; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1
@ -74,8 +81,9 @@ define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x doubl
}
; ALL-LABEL: 'fma_f16'
; SLOW16: estimated cost of 3 for {{.*}} call half @llvm.fma.f16
; FAST16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16
; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
%vec = load half, half addrspace(1)* %vaddr
%fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1
@ -84,8 +92,10 @@ define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %
}
; ALL-LABEL: 'fma_v2f16'
; SLOW16: estimated cost of 6 for {{.*}} call <2 x half> @llvm.fma.v2f16
; FAST16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16
; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16
define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1
@ -94,8 +104,10 @@ define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> a
}
; ALL-LABEL: 'fma_v3f16'
; SLOW16: estimated cost of 12 for {{.*}} call <3 x half> @llvm.fma.v3f16
; FAST16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16
; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16
define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
%fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1


@ -1,7 +1,7 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s
; ALL-LABEL: 'fmul_f32'
; ALL: estimated cost of 1 for {{.*}} fmul float
@ -22,9 +22,7 @@ define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float
}
; ALL-LABEL: 'fmul_v3f32'
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fmul <3 x float> %vec, %b
@ -33,9 +31,7 @@ define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float
}
; ALL-LABEL: 'fmul_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
; ALL: estimated cost of 5 for {{.*}} fmul <5 x float>
define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fmul <5 x float> %vec, %b
@ -45,7 +41,8 @@ define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float
; ALL-LABEL: 'fmul_f64'
; FASTF64: estimated cost of 2 for {{.*}} fmul double
; SLOWF64: estimated cost of 3 for {{.*}} fmul double
; SLOWF64: estimated cost of 4 for {{.*}} fmul double
; SIZEALL: estimated cost of 2 for {{.*}} fmul double
define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fmul double %vec, %b
@ -55,7 +52,8 @@ define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(
; ALL-LABEL: 'fmul_v2f64'
; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double>
; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double>
define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%add = fmul <2 x double> %vec, %b
@ -65,7 +63,8 @@ define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
; ALL-LABEL: 'fmul_v3f64'
; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double>
; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double>
define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%add = fmul <3 x double> %vec, %b


@ -1,9 +1,9 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s
; ALL: 'fsub_f32'
; ALL-LABEL: 'fsub_f32'
; ALL: estimated cost of 1 for {{.*}} fsub float
define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
%vec = load float, float addrspace(1)* %vaddr
@ -12,7 +12,7 @@ define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)
ret void
}
; ALL: 'fsub_v2f32'
; ALL-LABEL: 'fsub_v2f32'
; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
@ -21,10 +21,8 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float
ret void
}
; ALL: 'fsub_v3f32'
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float>
; ALL-LABEL: 'fsub_v3f32'
; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fsub <3 x float> %vec, %b
@ -32,10 +30,8 @@ define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float
ret void
}
; ALL: 'fsub_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float>
; ALL-LABEL: 'fsub_v5f32'
; ALL: estimated cost of 5 for {{.*}} fsub <5 x float>
define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fsub <5 x float> %vec, %b
@ -43,9 +39,10 @@ define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float
ret void
}
; ALL: 'fsub_f64'
; ALL-LABEL: 'fsub_f64'
; FASTF64: estimated cost of 2 for {{.*}} fsub double
; SLOWF64: estimated cost of 3 for {{.*}} fsub double
; SLOWF64: estimated cost of 4 for {{.*}} fsub double
; SIZEALL: estimated cost of 2 for {{.*}} fsub double
define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fsub double %vec, %b
@ -53,9 +50,10 @@ define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(
ret void
}
; ALL: 'fsub_v2f64'
; ALL-LABEL: 'fsub_v2f64'
; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double>
; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double>
define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%add = fsub <2 x double> %vec, %b
@ -63,9 +61,10 @@ define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
ret void
}
; ALL: 'fsub_v3f64'
; ALL-LABEL: 'fsub_v3f64'
; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double>
; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double>
define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%add = fsub <3 x double> %vec, %b
@ -73,7 +72,7 @@ define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
ret void
}
; ALL: 'fsub_f16'
; ALL-LABEL: 'fsub_f16'
; ALL: estimated cost of 1 for {{.*}} fsub half
define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
%vec = load half, half addrspace(1)* %vaddr
@ -82,7 +81,7 @@ define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)*
ret void
}
; ALL: 'fsub_v2f16'
; ALL-LABEL: 'fsub_v2f16'
; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half>
; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half>
define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
@ -92,7 +91,7 @@ define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
ret void
}
; ALL: 'fsub_v3f16'
; ALL-LABEL: 'fsub_v3f16'
; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half>
; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half>
define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
@ -102,7 +101,7 @@ define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
ret void
}
; ALL: 'fsub_v4f16'
; ALL-LABEL: 'fsub_v4f16'
; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half>
; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half>
define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {


@ -1,11 +1,11 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED32,FUSED16,NOCONTRACT,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED32,FUSED16,CONTRACT,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,THRPTALL,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,THRPTALL,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,THRPTALL,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,THRPTALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,SZNOCONTRACT,SIZEALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s
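; The run lines pair each invocation with a cost kind: THRPTALL covers the
; default throughput queries and SIZEALL the -cost-kind=code-size ones,
; while CONTRACT/NOCONTRACT/SZNOCONTRACT select the expectations that depend
; on whether an fmul+fadd pair may contract to FMA, and FUSED/SLOW track
; whether the target is expected to form a fast fused operation.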
target triple = "amdgcn--"
@@ -113,8 +113,10 @@ define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r
; ALL-LABEL: 'fmul_fadd_f64':
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
; ALL: estimated cost of 3 for instruction: %add = fadd double
; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double
; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double
; THRPTALL: estimated cost of 4 for instruction: %add = fadd double
; SIZEALL: estimated cost of 2 for instruction: %add = fadd double
define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
%mul = fmul double %r0, %r1
%add = fadd double %mul, %r2
@@ -123,7 +125,8 @@ define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
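; A hedged reading of the checks above and below: when contraction is
; allowed (CONTRACT, or the explicit `contract` flag), the fmul feeding the
; fadd is assumed to fold into an FMA, so the fmul is costed at 0 and the
; fadd carries the cost of the fused operation.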
; ALL-LABEL: 'fmul_fadd_contract_f64':
; ALL: estimated cost of 0 for instruction: %mul = fmul contract double
; ALL: estimated cost of 3 for instruction: %add = fadd contract double
; THRPTALL: estimated cost of 4 for instruction: %add = fadd contract double
; SIZEALL: estimated cost of 2 for instruction: %add = fadd contract double
define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
%mul = fmul contract double %r0, %r1
%add = fadd contract double %mul, %r2
@@ -132,8 +135,10 @@ define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
; ALL-LABEL: 'fmul_fadd_v2f64':
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
; ALL: estimated cost of 6 for instruction: %add = fadd <2 x double>
; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double>
; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
; THRPTALL: estimated cost of 8 for instruction: %add = fadd <2 x double>
; SIZEALL: estimated cost of 4 for instruction: %add = fadd <2 x double>
define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
%mul = fmul <2 x double> %r0, %r1
%add = fadd <2 x double> %mul, %r2
@@ -142,8 +147,10 @@ define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x do
; ALL-LABEL: 'fmul_fsub_f64':
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
; ALL: estimated cost of 3 for instruction: %sub = fsub double
; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double
; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double
; THRPTALL: estimated cost of 4 for instruction: %sub = fsub double
; SIZEALL: estimated cost of 2 for instruction: %sub = fsub double
define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
%mul = fmul double %r0, %r1
%sub = fsub double %mul, %r2
@@ -152,8 +159,10 @@ define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x do
; ALL-LABEL: 'fmul_fsub_v2f64':
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
; ALL: estimated cost of 6 for instruction: %sub = fsub <2 x double>
; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double>
; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
; THRPTALL: estimated cost of 8 for instruction: %sub = fsub <2 x double>
; SIZEALL: estimated cost of 4 for instruction: %sub = fsub <2 x double>
define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
%mul = fmul <2 x double> %r0, %r1
%sub = fsub <2 x double> %mul, %r2


@@ -1,10 +1,11 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,THRPTALL,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s
; ALL: 'mul_i32'
; ALL: estimated cost of 3 for {{.*}} mul i32
; ALL-LABEL: 'mul_i32'
; THRPTALL: estimated cost of 4 for {{.*}} mul i32
; SIZEALL: estimated cost of 2 for {{.*}} mul i32
define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
%vec = load i32, i32 addrspace(1)* %vaddr
%mul = mul i32 %vec, %b
@@ -12,8 +13,9 @@ define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
ret void
}
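; A rough cross-check, assuming a 32-bit mul is a single quarter-rate
; operation: the quarter-rate constants are 4 for throughput and 2 for code
; size, matching the THRPTALL/SIZEALL lines above.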
; ALL: 'mul_v2i32'
; ALL: estimated cost of 6 for {{.*}} mul <2 x i32>
; ALL-LABEL: 'mul_v2i32'
; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32>
; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32>
define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
%mul = mul <2 x i32> %vec, %b
@@ -21,10 +23,9 @@ define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
ret void
}
; ALL: 'mul_v3i32'
; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
; and 9 when it is legal.
; ALL: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
; ALL-LABEL: 'mul_v3i32'
; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32>
; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32>
define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
%mul = mul <3 x i32> %vec, %b
@@ -32,10 +33,9 @@ define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> add
ret void
}
; ALL: 'mul_v5i32'
; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
; and 15 when it is legal.
; ALL: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
; ALL-LABEL: 'mul_v5i32'
; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
%mul = mul <5 x i32> %vec, %b
@@ -43,8 +43,9 @@ define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> add
ret void
}
; ALL: 'mul_v4i32'
; ALL: estimated cost of 12 for {{.*}} mul <4 x i32>
; ALL-LABEL: 'mul_v4i32'
; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32>
; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32>
define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
%mul = mul <4 x i32> %vec, %b
@@ -52,8 +53,9 @@ define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
ret void
}
; ALL: 'mul_i64'
; ALL: estimated cost of 16 for {{.*}} mul i64
; ALL-LABEL: 'mul_i64'
; THRPTALL: estimated cost of 20 for {{.*}} mul i64
; SIZEALL: estimated cost of 12 for {{.*}} mul i64
define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
%vec = load i64, i64 addrspace(1)* %vaddr
%mul = mul i64 %vec, %b
@@ -61,8 +63,9 @@ define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
ret void
}
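; A sketch of the i64 numbers, assuming a 64-bit mul expands to 4
; quarter-rate plus 4 full-rate operations (full rate = 1 for both cost
; kinds): throughput = 4 * 4 + 4 * 1 = 20, code size = 4 * 2 + 4 * 1 = 12.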
; ALL: 'mul_v2i64'
; ALL: estimated cost of 32 for {{.*}} mul <2 x i64>
; ALL-LABEL: 'mul_v2i64'
; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64>
; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64>
define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
%mul = mul <2 x i64> %vec, %b
@@ -70,8 +73,9 @@ define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> add
ret void
}
; ALL: 'mul_v3i64'
; ALL: estimated cost of 48 for {{.*}} mul <3 x i64>
; ALL-LABEL: 'mul_v3i64'
; THRPTALL: estimated cost of 60 for {{.*}} mul <3 x i64>
; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64>
define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
%mul = mul <3 x i64> %vec, %b
@@ -79,8 +83,9 @@ define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> add
ret void
}
; ALL: 'mul_v4i64'
; ALL: estimated cost of 64 for {{.*}} mul <4 x i64>
; ALL-LABEL: 'mul_v4i64'
; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64>
; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64>
define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
%mul = mul <4 x i64> %vec, %b
@@ -89,8 +94,9 @@ define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
}
; ALL: 'mul_v8i64'
; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
; ALL-LABEL: 'mul_v8i64'
; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64>
; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64>
define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
%vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
%mul = mul <8 x i64> %vec, %b
@@ -98,8 +104,9 @@ define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> add
ret void
}
; ALL: 'mul_i16'
; ALL: estimated cost of 3 for {{.*}} mul i16
; ALL-LABEL: 'mul_i16'
; THRPTALL: estimated cost of 4 for {{.*}} mul i16
; SIZEALL: estimated cost of 2 for {{.*}} mul i16
define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
%vec = load i16, i16 addrspace(1)* %vaddr
%mul = mul i16 %vec, %b
@@ -107,9 +114,11 @@ define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
ret void
}
; ALL: 'mul_v2i16'
; SLOW16: estimated cost of 6 for {{.*}} mul <2 x i16>
; FAST16: estimated cost of 3 for {{.*}} mul <2 x i16>
; ALL-LABEL: 'mul_v2i16'
; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16>
; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16>
; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16>
; SIZEFAST16: estimated cost of 2 for {{.*}} mul <2 x i16>
define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
%mul = mul <2 x i16> %vec, %b
@@ -117,9 +126,11 @@ define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
ret void
}
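; Assuming i16 mul keeps the quarter rate: without 16-bit instructions the
; two lanes are costed separately (2 * 4 = 8 throughput, 2 * 2 = 4 code
; size); with packed 16-bit math the pair counts as a single operation
; (4 and 2).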
; ALL: 'mul_v3i16'
; SLOW16: estimated cost of 12 for {{.*}} mul <3 x i16>
; FAST16: estimated cost of 6 for {{.*}} mul <3 x i16>
; ALL-LABEL: 'mul_v3i16'
; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16>
; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16>
; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16>
; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16>
define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
%vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
%mul = mul <3 x i16> %vec, %b


@@ -1,9 +1,9 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s
; ALL: 'shl_i32'
; ALL-LABEL: 'shl_i32'
; ALL: estimated cost of 1 for {{.*}} shl i32
define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
%vec = load i32, i32 addrspace(1)* %vaddr
@@ -12,9 +12,10 @@ define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
ret void
}
; ALL: 'shl_i64'
; ALL-LABEL: 'shl_i64'
; FAST64: estimated cost of 2 for {{.*}} shl i64
; SLOW64: estimated cost of 3 for {{.*}} shl i64
; SLOW64: estimated cost of 4 for {{.*}} shl i64
; SIZEALL: estimated cost of 2 for {{.*}} shl i64
define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
%vec = load i64, i64 addrspace(1)* %vaddr
%or = shl i64 %vec, %b
@@ -22,7 +23,7 @@ define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
ret void
}
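; A hedged note on the 64-bit shift costs: with +half-rate-64-ops the shift
; is half rate (cost 2 under either cost kind); without it the quarter-rate
; estimate applies, i.e. 4 for throughput but still 2 for code size, which
; is why the SIZEALL runs see no FAST64/SLOW64 split.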
; ALL: 'shl_i16'
; ALL-LABEL: 'shl_i16'
; ALL: estimated cost of 1 for {{.*}} shl i16
define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
%vec = load i16, i16 addrspace(1)* %vaddr
@@ -31,7 +32,7 @@ define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
ret void
}
; ALL: 'shl_v2i16'
; ALL-LABEL: 'shl_v2i16'
; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -41,7 +42,7 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
ret void
}
; ALL: 'lshr_i32'
; ALL-LABEL: 'lshr_i32'
; ALL: estimated cost of 1 for {{.*}} lshr i32
define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
%vec = load i32, i32 addrspace(1)* %vaddr
@@ -50,9 +51,10 @@ define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %v
ret void
}
; ALL: 'lshr_i64'
; ALL-LABEL: 'lshr_i64'
; FAST64: estimated cost of 2 for {{.*}} lshr i64
; SLOW64: estimated cost of 3 for {{.*}} lshr i64
; SLOW64: estimated cost of 4 for {{.*}} lshr i64
; SIZEALL: estimated cost of 2 for {{.*}} lshr i64
define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
%vec = load i64, i64 addrspace(1)* %vaddr
%or = lshr i64 %vec, %b
@@ -60,7 +62,7 @@ define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
ret void
}
; ALL: 'lshr_i16'
; ALL-LABEL: 'lshr_i16'
; ALL: estimated cost of 1 for {{.*}} lshr i16
define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
%vec = load i16, i16 addrspace(1)* %vaddr
@@ -69,7 +71,7 @@ define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %v
ret void
}
; ALL: 'lshr_v2i16'
; ALL-LABEL: 'lshr_v2i16'
; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -79,7 +81,7 @@ define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ad
ret void
}
; ALL: 'ashr_i32'
; ALL-LABEL: 'ashr_i32'
; ALL: estimated cost of 1 for {{.*}} ashr i32
define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
%vec = load i32, i32 addrspace(1)* %vaddr
@@ -88,9 +90,9 @@ define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %v
ret void
}
; ALL: 'ashr_i64'
; ALL-LABEL: 'ashr_i64'
; FAST64: estimated cost of 2 for {{.*}} ashr i64
; SLOW64: estimated cost of 3 for {{.*}} ashr i64
; SLOW64: estimated cost of 4 for {{.*}} ashr i64
define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
%vec = load i64, i64 addrspace(1)* %vaddr
%or = ashr i64 %vec, %b
@@ -98,7 +100,7 @@ define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
ret void
}
; ALL: 'ashr_i16'
; ALL-LABEL: 'ashr_i16'
; ALL: estimated cost of 1 for {{.*}} ashr i16
define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
%vec = load i16, i16 addrspace(1)* %vaddr
@@ -107,7 +109,7 @@ define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %v
ret void
}
; ALL: 'ashr_v2i16'
; ALL-LABEL: 'ashr_v2i16'
; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {