From 2fc96e575d5d418af21ee50efba5a762916cde3e Mon Sep 17 00:00:00 2001 From: dfukalov Date: Mon, 18 Nov 2019 16:42:34 +0300 Subject: [PATCH] [AMDGPU] Tune inlining parameters for AMDGPU target (part 2) Summary: Most IR instructions got better code size estimations after commit 47a5c36b, so the default parameter values should be updated to improve inlining and unrolling for the target. Reviewers: rampitec, arsenm Reviewed By: rampitec Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, zzheng, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D70391 --- lib/Target/AMDGPU/AMDGPUInline.cpp | 2 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 +- test/CodeGen/AMDGPU/amdgpu-inline.ll | 7 +++++++ test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll | 2 +- 5 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index 4bb7b9d467a..64d761997b0 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -39,7 +39,7 @@ using namespace llvm; #define DEBUG_TYPE "inline" static cl::opt -ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500), +ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument")); // If the amount of scratch memory to eliminate exceeds our ability to allocate diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index c1b475dd101..0d44f3be539 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -57,7 +57,7 @@ using namespace llvm; static cl::opt UnrollThresholdPrivate( "amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), - cl::init(2000), cl::Hidden); + cl::init(2700), cl::Hidden); 
static cl::opt UnrollThresholdLocal( "amdgpu-unroll-threshold-local", diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 67f7f9074f1..b6e2db454e6 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -204,7 +204,7 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - unsigned getInliningThresholdMultiplier() { return 9; } + unsigned getInliningThresholdMultiplier() { return 11; } int getInlinerVectorBonusPercent() { return 0; } diff --git a/test/CodeGen/AMDGPU/amdgpu-inline.ll b/test/CodeGen/AMDGPU/amdgpu-inline.ll index c2f1836f44a..243522e28dd 100644 --- a/test/CodeGen/AMDGPU/amdgpu-inline.ll +++ b/test/CodeGen/AMDGPU/amdgpu-inline.ll @@ -28,8 +28,15 @@ if.end: ; preds = %if.then, %entry define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) { entry: %tmp1 = load float, float addrspace(5)* %p1, align 4 + %cmp = fcmp ogt float %tmp1, 1.000000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: %div = fdiv float 2.000000e+00, %tmp1 store float %div, float addrspace(5)* %p2, align 4 + br label %if.end + +if.end: ret void } diff --git a/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll index 74e124cf6af..d1fbc87602b 100644 --- a/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll +++ b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll @@ -1,4 +1,4 @@ -; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=12000 %s | FileCheck %s +; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S %s | FileCheck %s ; Check that we full unroll loop to be able to eliminate alloca ; CHECK-LABEL: @non_invariant_ind