From f13ef266137499575a2b8f23cb24ad9c6395f07d Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 22 Jul 2021 18:19:54 +0100
Subject: [PATCH] [AArch64] Adjust the cost of integer sum reductions

This changes the cost to (LT.first-1) * cost(add) + 2, where the cost of
an add is assumed to be 1. This brings it inline with the other
reductions.

Differential Revision: https://reviews.llvm.org/D106240
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 17 ++++++-------
 test/Analysis/CostModel/AArch64/reduce-add.ll | 24 +++++++++----------
 .../CostModel/AArch64/vector-reduce.ll        | 10 ++++----
 .../SLPVectorizer/AArch64/gather-cost.ll      |  2 +-
 .../SLPVectorizer/AArch64/horizontal.ll       |  6 ++---
 5 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4ab754465f3..1d2d49b287d 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1966,18 +1966,19 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   assert(ISD && "Invalid opcode");
 
   // Horizontal adds can use the 'addv' instruction. We model the cost of these
-  // instructions as normal vector adds. This is the only arithmetic vector
-  // reduction operation for which we have an instruction.
+  // instructions as twice a normal vector add, plus 1 for each legalization
+  // step (LT.first). This is the only arithmetic vector reduction operation for
+  // which we have an instruction.
   // OR, XOR and AND costs should match the codegen from:
   // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
   // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
   // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
   static const CostTblEntry CostTblNoPairwise[]{
-      {ISD::ADD, MVT::v8i8,   1},
-      {ISD::ADD, MVT::v16i8,  1},
-      {ISD::ADD, MVT::v4i16,  1},
-      {ISD::ADD, MVT::v8i16,  1},
-      {ISD::ADD, MVT::v4i32,  1},
+      {ISD::ADD, MVT::v8i8,   2},
+      {ISD::ADD, MVT::v16i8,  2},
+      {ISD::ADD, MVT::v4i16,  2},
+      {ISD::ADD, MVT::v8i16,  2},
+      {ISD::ADD, MVT::v4i32,  2},
       {ISD::OR,  MVT::v8i8,  15},
       {ISD::OR,  MVT::v16i8, 17},
       {ISD::OR,  MVT::v4i16,  7},
@@ -2005,7 +2006,7 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
     break;
   case ISD::ADD:
     if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
-      return LT.first * Entry->Cost;
+      return (LT.first - 1) + Entry->Cost;
     break;
   case ISD::XOR:
   case ISD::AND:
diff --git a/test/Analysis/CostModel/AArch64/reduce-add.ll b/test/Analysis/CostModel/AArch64/reduce-add.ll
index 8bbb96f20dc..e1f2af680a6 100644
--- a/test/Analysis/CostModel/AArch64/reduce-add.ll
+++ b/test/Analysis/CostModel/AArch64/reduce-add.ll
@@ -3,20 +3,20 @@
 
 define void @reduce() {
 ; CHECK-LABEL: 'reduce'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1i8 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1i8 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i8 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4i64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/test/Analysis/CostModel/AArch64/vector-reduce.ll b/test/Analysis/CostModel/AArch64/vector-reduce.ll
index df25a6fedf2..4aef33c632d 100644
--- a/test/Analysis/CostModel/AArch64/vector-reduce.ll
+++ b/test/Analysis/CostModel/AArch64/vector-reduce.ll
@@ -3,7 +3,7 @@
 
 define i8 @add.i8.v8i8(<8 x i8> %v) {
 ; COST-LABEL: 'add.i8.v8i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
@@ -12,7 +12,7 @@ define i8 @add.i8.v8i8(<8 x i8> %v) {
 
 define i8 @add.i8.v16i8(<16 x i8> %v) {
 ; COST-LABEL: 'add.i8.v16i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
@@ -21,7 +21,7 @@ define i8 @add.i8.v16i8(<16 x i8> %v) {
 
 define i16 @add.i16.v4i16(<4 x i16> %v) {
 ; COST-LABEL: 'add.i16.v4i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
@@ -30,7 +30,7 @@ define i16 @add.i16.v4i16(<4 x i16> %v) {
 
 define i16 @add.i16.v8i16(<8 x i16> %v) {
 ; COST-LABEL: 'add.i16.v8i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
@@ -39,7 +39,7 @@ define i16 @add.i16.v8i16(<8 x i16> %v) {
 
 define i32 @add.i32.v4i32(<4 x i32> %v) {
 ; COST-LABEL: 'add.i32.v4i32'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index 57db62ace20..22370f55e12 100644
--- a/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"
 ; REMARK-LABEL: Function: gather_multiple_use
 ; REMARK:       Args:
 ; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
-; REMARK-NEXT:    - Cost: '-7'
+; REMARK-NEXT:    - Cost: '-6'
 ;
 ; REMARK-NOT: Function: gather_load
 
diff --git a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 2666a9f3bd6..a9efaf3b07d 100644
--- a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -15,7 +15,7 @@ target triple = "aarch64--linux"
 ; YAML-NEXT: Function:        test_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-20'
+; YAML-NEXT:   - Cost:            '-19'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '8'
 
@@ -143,7 +143,7 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
 ; YAML-NEXT: Function:        reduction_with_br
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-11'
+; YAML-NEXT:   - Cost:            '-10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 ; CHECK-LABEL: @reduction_with_br(
@@ -244,7 +244,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; YAML-NEXT: Function:        test_unrolled_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-37'
+; YAML-NEXT:   - Cost:            '-36'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '10'