
[REVERT][LV][X86] update the cost of interleaving mem. access of floats

Reverted; the changes will be recommitted later, after the failure is fixed.
The reverted patch contained an update to the costs of interleaved loads of v8f32 with strides 3 and 8.

Differential Revision: https://reviews.llvm.org/D39403

llvm-svn: 317433
Mohammed Agabaria 2017-11-05 09:36:54 +00:00
parent 58aaaace99
commit 01b828cb80
2 changed files with 1 addition and 145 deletions
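For context, the stride-8 case covered by the reverted costs corresponds to a loop that reads eight consecutive floats per iteration, matching the @stride8 test deleted below. A minimal C++ sketch of that access pattern (function and parameter names are hypothetical, not taken from the patch):

// Hypothetical analogue of the @stride8 IR test below: each iteration
// touches 8 consecutive floats of src and dst, so the loop vectorizer
// models the loads as one interleaved group with factor 8.
void stride8(float *dst, const float *src, float k, int width) {
  for (int i = 0; i < width; i += 8)
    for (int j = 0; j < 8; ++j) // this inner loop is fully unrolled in the IR
      dst[i + j] += src[i + j] * k;
}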

@@ -2644,15 +2644,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
     { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
     { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
     { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
-    { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
     { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
     { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
     { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
     { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
-    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
-
-    { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
+    { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
   };
static const CostTblEntry AVX2InterleavedStoreTbl[] = {
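The entries in these tables are keyed by interleave factor and vector type, and the cost model resolves a query with a linear table lookup. A self-contained C++ sketch of that lookup logic (CostTblEntry here is a simplified stand-in for llvm::CostTblEntry, the MVT is reduced to an enum tag, and the two entries shown are the reverted v8f32 ones from the diff above):

// Simplified stand-in for llvm::CostTblEntry.
enum class SimpleVT { v8f32 /* ... */ };

struct CostTblEntry {
  unsigned Factor; // interleave stride
  SimpleVT Ty;     // vector type of each deinterleaved piece
  unsigned Cost;   // extra shuffle cost on top of the wide loads
};

static const CostTblEntry InterleavedLoadTbl[] = {
    {3, SimpleVT::v8f32, 17}, // (load 24f32 and) deinterleave into 3 x 8f32
    {8, SimpleVT::v8f32, 40}, // (load 64f32 and) deinterleave into 8 x 8f32
};

// Linear scan keyed on (Factor, Ty), mirroring llvm::CostTableLookup;
// returns nullptr on a miss so the caller can fall back to a generic cost.
const CostTblEntry *costTableLookup(unsigned Factor, SimpleVT Ty) {
  for (const auto &E : InterleavedLoadTbl)
    if (E.Factor == Factor && E.Ty == Ty)
      return &E;
  return nullptr;
}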

@@ -1,141 +0,0 @@
; REQUIRES: asserts
; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"
@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4

; Function Attrs: norecurse nounwind
define void @stride8(float %k, i32 %width_) {
entry:
; CHECK: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float
%cmp72 = icmp sgt i32 %width_, 0
br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.body.lr.ph, %for.body
%i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
%arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
%0 = load float, float* %arrayidx, align 4
%mul = fmul fast float %0, %k
%arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
%1 = load float, float* %arrayidx2, align 4
%add3 = fadd fast float %1, %mul
store float %add3, float* %arrayidx2, align 4
%add4 = or i32 %i.073, 1
%arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
%2 = load float, float* %arrayidx5, align 4
%mul6 = fmul fast float %2, %k
%arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
%3 = load float, float* %arrayidx8, align 4
%add9 = fadd fast float %3, %mul6
store float %add9, float* %arrayidx8, align 4
%add10 = or i32 %i.073, 2
%arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
%4 = load float, float* %arrayidx11, align 4
%mul12 = fmul fast float %4, %k
%arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
%5 = load float, float* %arrayidx14, align 4
%add15 = fadd fast float %5, %mul12
store float %add15, float* %arrayidx14, align 4
%add16 = or i32 %i.073, 3
%arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
%6 = load float, float* %arrayidx17, align 4
%mul18 = fmul fast float %6, %k
%arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
%7 = load float, float* %arrayidx20, align 4
%add21 = fadd fast float %7, %mul18
store float %add21, float* %arrayidx20, align 4
%add22 = or i32 %i.073, 4
%arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
%8 = load float, float* %arrayidx23, align 4
%mul24 = fmul fast float %8, %k
%arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
%9 = load float, float* %arrayidx26, align 4
%add27 = fadd fast float %9, %mul24
store float %add27, float* %arrayidx26, align 4
%add28 = or i32 %i.073, 5
%arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
%10 = load float, float* %arrayidx29, align 4
%mul30 = fmul fast float %10, %k
%arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
%11 = load float, float* %arrayidx32, align 4
%add33 = fadd fast float %11, %mul30
store float %add33, float* %arrayidx32, align 4
%add34 = or i32 %i.073, 6
%arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
%12 = load float, float* %arrayidx35, align 4
%mul36 = fmul fast float %12, %k
%arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
%13 = load float, float* %arrayidx38, align 4
%add39 = fadd fast float %13, %mul36
store float %add39, float* %arrayidx38, align 4
%add40 = or i32 %i.073, 7
%arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
%14 = load float, float* %arrayidx41, align 4
%mul42 = fmul fast float %14, %k
%arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
%15 = load float, float* %arrayidx44, align 4
%add45 = fadd fast float %15, %mul42
store float %add45, float* %arrayidx44, align 4
%add46 = add nuw nsw i32 %i.073, 8
%cmp = icmp slt i32 %add46, %width_
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: norecurse nounwind
define void @stride3(float %k, i32 %width_) {
entry:
; CHECK: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float
%cmp27 = icmp sgt i32 %width_, 0
br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.lr.ph, %for.body
%i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
%arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
%0 = load float, float* %arrayidx, align 4
%mul = fmul fast float %0, %k
%arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
%1 = load float, float* %arrayidx2, align 4
%add3 = fadd fast float %1, %mul
store float %add3, float* %arrayidx2, align 4
%add4 = add nuw nsw i32 %i.028, 1
%arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
%2 = load float, float* %arrayidx5, align 4
%mul6 = fmul fast float %2, %k
%arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
%3 = load float, float* %arrayidx8, align 4
%add9 = fadd fast float %3, %mul6
store float %add9, float* %arrayidx8, align 4
%add10 = add nuw nsw i32 %i.028, 2
%arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
%4 = load float, float* %arrayidx11, align 4
%mul12 = fmul fast float %4, %k
%arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
%5 = load float, float* %arrayidx14, align 4
%add15 = fadd fast float %5, %mul12
store float %add15, float* %arrayidx14, align 4
%add16 = add nuw nsw i32 %i.028, 3
%cmp = icmp slt i32 %add16, %width_
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
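For reference, the CHECK lines in this deleted test match the loop vectorizer's cost-model debug output enabled by -debug-only=loop-vectorize. The matched lines look roughly like the following (the "LV:" prefix is assumed from LoopVectorize's debug-output convention; the rest of the line is taken from the CHECK pattern itself):

LV: Found an estimated cost of 48 for VF 8 For instruction:   %0 = load float, float* %arrayidx, align 4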