From 2fc96e575d5d418af21ee50efba5a762916cde3e Mon Sep 17 00:00:00 2001
From: dfukalov <daniil.fukalov@amd.com>
Date: Mon, 18 Nov 2019 16:42:34 +0300
Subject: [PATCH] [AMDGPU] Tune inlining parameters for AMDGPU target (part 2)

Summary:
Most of IR instructions got better code size estimations after commit 47a5c36b.
So default parameters values should be updated to improve inlining and
unrolling for the target.

Reviewers: rampitec, arsenm

Reviewed By: rampitec

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, zzheng, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70391
---
 lib/Target/AMDGPU/AMDGPUInline.cpp                      | 2 +-
 lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp         | 2 +-
 lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h           | 2 +-
 test/CodeGen/AMDGPU/amdgpu-inline.ll                    | 7 +++++++
 test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll | 2 +-
 5 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index 4bb7b9d467a..64d761997b0 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -39,7 +39,7 @@ using namespace llvm;
 #define DEBUG_TYPE "inline"
 
 static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000),
               cl::desc("Cost of alloca argument"));
 
 // If the amount of scratch memory to eliminate exceeds our ability to allocate
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c1b475dd101..0d44f3be539 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -57,7 +57,7 @@ using namespace llvm;
 static cl::opt<unsigned> UnrollThresholdPrivate(
   "amdgpu-unroll-threshold-private",
   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
-  cl::init(2000), cl::Hidden);
+  cl::init(2700), cl::Hidden);
 
 static cl::opt<unsigned> UnrollThresholdLocal(
   "amdgpu-unroll-threshold-local",
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 67f7f9074f1..b6e2db454e6 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -204,7 +204,7 @@ public:
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
-  unsigned getInliningThresholdMultiplier() { return 9; }
+  unsigned getInliningThresholdMultiplier() { return 11; }
 
   int getInlinerVectorBonusPercent() { return 0; }
 
diff --git a/test/CodeGen/AMDGPU/amdgpu-inline.ll b/test/CodeGen/AMDGPU/amdgpu-inline.ll
index c2f1836f44a..243522e28dd 100644
--- a/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -28,8 +28,15 @@ if.end:                                           ; preds = %if.then, %entry
 define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
 entry:
   %tmp1 = load float, float addrspace(5)* %p1, align 4
+  %cmp = fcmp ogt float %tmp1, 1.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
   %div = fdiv float 2.000000e+00, %tmp1
   store float %div, float addrspace(5)* %p2, align 4
+  br label %if.end
+
+if.end:
   ret void
 }
 
diff --git a/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
index 74e124cf6af..d1fbc87602b 100644
--- a/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
+++ b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
@@ -1,4 +1,4 @@
-; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=12000 %s | FileCheck %s
+; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S %s | FileCheck %s
 
 ; Check that we full unroll loop to be able to eliminate alloca
 ; CHECK-LABEL: @non_invariant_ind