mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-24 03:33:20 +01:00
[AMDGPU] Improve code size cost model (part 2)
Summary:
Added estimations for ShuffleVector, some cast and arithmetic instructions.

Reviewers: rampitec

Reviewed By: rampitec

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, zzheng, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69629
This commit is contained in:
parent
2a67a1de31
commit
3c6fc97f38
@ -695,34 +695,114 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
||||
|
||||
unsigned GCNTTIImpl::getUserCost(const User *U,
|
||||
ArrayRef<const Value *> Operands) {
|
||||
// Estimate extractelement elimination
|
||||
if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) {
|
||||
ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1));
|
||||
const Instruction *I = dyn_cast<Instruction>(U);
|
||||
if (!I)
|
||||
return BaseT::getUserCost(U, Operands);
|
||||
|
||||
// Estimate different operations to be optimized out
|
||||
switch (I->getOpcode()) {
|
||||
case Instruction::ExtractElement: {
|
||||
ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
|
||||
unsigned Idx = -1;
|
||||
if (CI)
|
||||
Idx = CI->getZExtValue();
|
||||
return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(),
|
||||
Idx);
|
||||
return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
|
||||
}
|
||||
|
||||
// Estimate insertelement elimination
|
||||
if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) {
|
||||
ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
|
||||
case Instruction::InsertElement: {
|
||||
ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
|
||||
unsigned Idx = -1;
|
||||
if (CI)
|
||||
Idx = CI->getZExtValue();
|
||||
return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx);
|
||||
return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
|
||||
}
|
||||
case Instruction::Call: {
|
||||
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
|
||||
SmallVector<Value *, 4> Args(II->arg_operands());
|
||||
FastMathFlags FMF;
|
||||
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
|
||||
FMF = FPMO->getFastMathFlags();
|
||||
return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
|
||||
FMF);
|
||||
} else {
|
||||
return BaseT::getUserCost(U, Operands);
|
||||
}
|
||||
}
|
||||
case Instruction::ShuffleVector: {
|
||||
const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
|
||||
Type *Ty = Shuffle->getType();
|
||||
Type *SrcTy = Shuffle->getOperand(0)->getType();
|
||||
|
||||
// TODO: Identify and add costs for insert subvector, etc.
|
||||
int SubIndex;
|
||||
if (Shuffle->isExtractSubvectorMask(SubIndex))
|
||||
return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);
|
||||
|
||||
if (Shuffle->changesLength())
|
||||
return -1;
|
||||
|
||||
if (Shuffle->isIdentity())
|
||||
return 0;
|
||||
|
||||
if (Shuffle->isReverse())
|
||||
return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);
|
||||
|
||||
if (Shuffle->isSelect())
|
||||
return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);
|
||||
|
||||
if (Shuffle->isTranspose())
|
||||
return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);
|
||||
|
||||
if (Shuffle->isZeroEltSplat())
|
||||
return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);
|
||||
|
||||
if (Shuffle->isSingleSource())
|
||||
return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);
|
||||
|
||||
return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
|
||||
}
|
||||
case Instruction::ZExt:
|
||||
case Instruction::SExt:
|
||||
case Instruction::FPToUI:
|
||||
case Instruction::FPToSI:
|
||||
case Instruction::FPExt:
|
||||
case Instruction::PtrToInt:
|
||||
case Instruction::IntToPtr:
|
||||
case Instruction::SIToFP:
|
||||
case Instruction::UIToFP:
|
||||
case Instruction::Trunc:
|
||||
case Instruction::FPTrunc:
|
||||
case Instruction::BitCast:
|
||||
case Instruction::AddrSpaceCast: {
|
||||
return getCastInstrCost(I->getOpcode(), I->getType(),
|
||||
I->getOperand(0)->getType(), I);
|
||||
}
|
||||
case Instruction::Add:
|
||||
case Instruction::FAdd:
|
||||
case Instruction::Sub:
|
||||
case Instruction::FSub:
|
||||
case Instruction::Mul:
|
||||
case Instruction::FMul:
|
||||
case Instruction::UDiv:
|
||||
case Instruction::SDiv:
|
||||
case Instruction::FDiv:
|
||||
case Instruction::URem:
|
||||
case Instruction::SRem:
|
||||
case Instruction::FRem:
|
||||
case Instruction::Shl:
|
||||
case Instruction::LShr:
|
||||
case Instruction::AShr:
|
||||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor:
|
||||
case Instruction::FNeg: {
|
||||
return getArithmeticInstrCost(I->getOpcode(), I->getType(),
|
||||
TTI::OK_AnyValue, TTI::OK_AnyValue,
|
||||
TTI::OP_None, TTI::OP_None, Operands);
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
// Estimate different intrinsics, e.g. llvm.fabs
|
||||
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
|
||||
SmallVector<Value *, 4> Args(II->arg_operands());
|
||||
FastMathFlags FMF;
|
||||
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
|
||||
FMF = FPMO->getFastMathFlags();
|
||||
return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
|
||||
FMF);
|
||||
}
|
||||
return BaseT::getUserCost(U, Operands);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,8 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s
|
||||
|
||||
|
||||
; CHECK: 'add_i32'
|
||||
; CHECK: estimated cost of 1 for {{.*}} add i32
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: 'addrspacecast_global_to_flat'
|
||||
; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8*
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
|
||||
; CHECK: 'or_i32'
|
||||
; CHECK: estimated cost of 1 for {{.*}} or i32
|
||||
|
@ -1,5 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
|
||||
; ALL: 'fadd_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd float
|
||||
|
@ -5,6 +5,13 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s
|
||||
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s
|
||||
|
||||
; ALL: 'fdiv_f32'
|
||||
; NOFP32DENORM: estimated cost of 12 for {{.*}} fdiv float
|
||||
; FP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
|
||||
|
@ -1,5 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
|
||||
; ALL: 'fmul_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fmul float
|
||||
|
@ -1,5 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
|
||||
; ALL: 'fsub_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fsub float
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
|
||||
; CHECK: 'mul_i32'
|
||||
; CHECK: estimated cost of 3 for {{.*}} mul i32
|
||||
|
@ -1,5 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s
|
||||
|
||||
; ALL: 'shl_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} shl i32
|
||||
|
@ -1,7 +1,11 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
|
||||
|
||||
; GCN-LABEL: 'shufflevector_00_v2i16'
|
||||
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
|
||||
; VI: estimated cost of 1 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
|
||||
define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
|
||||
@ -9,7 +13,8 @@ define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out,
|
||||
ret void
|
||||
}
|
||||
|
||||
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
|
||||
; GCN-LABEL: 'shufflevector_01_v2i16'
|
||||
; GCN: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
|
||||
define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
|
||||
@ -17,7 +22,9 @@ define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out,
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: 'shufflevector_10_v2i16'
|
||||
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
|
||||
; VI: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
|
||||
define amdgpu_kernel void @shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
|
||||
@ -25,7 +32,9 @@ define amdgpu_kernel void @shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out,
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: 'shufflevector_11_v2i16'
|
||||
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
|
||||
; VI: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
|
||||
define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
|
||||
@ -33,6 +42,7 @@ define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out,
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: 'shufflevector_02_v2i16'
|
||||
; GCN: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
|
||||
define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr0, <2 x i16> addrspace(1)* %vaddr1) {
|
||||
%vec0 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr0
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=20000 %s | FileCheck %s
|
||||
; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=12000 %s | FileCheck %s
|
||||
|
||||
; Check that we full unroll loop to be able to eliminate alloca
|
||||
; CHECK-LABEL: @non_invariant_ind
|
||||
|
Loading…
Reference in New Issue
Block a user