1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00

[X86] Make some cast costs more precise

Make some AVX and AVX512 cast costs more precise.
Based on part of a patch by Elena Demikhovsky (D15604).

Differential Revision: http://reviews.llvm.org/D22064

llvm-svn: 275106
This commit is contained in:
Michael Kuperstein 2016-07-11 21:39:44 +00:00
parent b281da1235
commit 7e6a08b33c
5 changed files with 53 additions and 40 deletions

View File

@ -547,6 +547,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
// TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
// 256-bit wide vectors.
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
@ -577,6 +580,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
@ -591,11 +596,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
@ -685,6 +692,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
@ -693,9 +701,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
// here. We have roughly 10 instructions per scalar element.
// Multiply that by the vector width.
// FIXME: remove that when PR19268 is fixed.
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
// This node is expanded into scalarized operations but BasicTTI is overly
@ -705,6 +715,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
// should be factored in too. Inflating the cost per element by 1.
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
{ ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
};
static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {

View File

@ -238,21 +238,21 @@ define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) {
;CHECK-LABEL: for function 'fp_conv'
; CHECK-AVX512: cost of 1 {{.*}} fpext
%A1 = fpext <8 x float> %a to <8 x double>
; CHECK: cost of 1 {{.*}} %A1 = fpext
%A1 = fpext <4 x float> %c to <4 x double>
; CHECK-AVX512: cost of 1 {{.*}} fpext
%A2 = fpext <4 x float> %c to <4 x double>
; CHECK-AVX: cost of 3 {{.*}} %A2 = fpext
; CHECK-AVX2: cost of 3 {{.*}} %A2 = fpext
; CHECK-AVX512: cost of 1 {{.*}} %A2 = fpext
%A2 = fpext <8 x float> %a to <8 x double>
; CHECK-AVX2: cost of 3 {{.*}} %A3 = fpext
; CHECK-AVX512: cost of 1 {{.*}} %A3 = fpext
%A3 = fpext <8 x float> %a to <8 x double>
; CHECK: cost of 1 {{.*}} %A3 = fptrunc
%A3 = fptrunc <4 x double> undef to <4 x float>
; CHECK-AVX: cost of 3 {{.*}} %A4 = fptrunc
; CHECK-AVX2: cost of 3 {{.*}} %A4 = fptrunc
; CHECK-AVX512: cost of 1 {{.*}} %A4 = fptrunc
%A4 = fptrunc <8 x double> undef to <8 x float>
; CHECK-AVX512: cost of 1 {{.*}} %A5 = fptrunc
%A5 = fptrunc <4 x double> undef to <4 x float>
ret void
}

View File

@ -264,13 +264,13 @@ define <4 x double> @sitofpv4i64v4double(<4 x i64> %a) {
; SSE2: cost of 40 {{.*}} sitofp
;
; AVX1-LABEL: sitofpv4i64v4double
; AVX1: cost of 10 {{.*}} sitofp
; AVX1: cost of 13 {{.*}} sitofp
;
; AVX2-LABEL: sitofpv4i64v4double
; AVX2: cost of 10 {{.*}} sitofp
; AVX2: cost of 13 {{.*}} sitofp
;
; AVX512F-LABEL: sitofpv4i64v4double
; AVX512F: cost of 10 {{.*}} sitofp
; AVX512F: cost of 13 {{.*}} sitofp
%1 = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %1
}
@ -280,10 +280,10 @@ define <8 x double> @sitofpv8i64v8double(<8 x i64> %a) {
; SSE2: cost of 80 {{.*}} sitofp
;
; AVX1-LABEL: sitofpv8i64v8double
; AVX1: cost of 21 {{.*}} sitofp
; AVX1: cost of 27 {{.*}} sitofp
;
; AVX2-LABEL: sitofpv8i64v8double
; AVX2: cost of 21 {{.*}} sitofp
; AVX2: cost of 27 {{.*}} sitofp
;
; AVX512F-LABEL: sitofpv8i64v8double
; AVX512F: cost of 22 {{.*}} sitofp
@ -296,10 +296,10 @@ define <16 x double> @sitofpv16i64v16double(<16 x i64> %a) {
; SSE2: cost of 160 {{.*}} sitofp
;
; AVX1-LABEL: sitofpv16i64v16double
; AVX1: cost of 43 {{.*}} sitofp
; AVX1: cost of 55 {{.*}} sitofp
;
; AVX2-LABEL: sitofpv16i64v16double
; AVX2: cost of 43 {{.*}} sitofp
; AVX2: cost of 55 {{.*}} sitofp
;
; AVX512F-LABEL: sitofpv16i64v16double
; AVX512F: cost of 45 {{.*}} sitofp
@ -312,10 +312,10 @@ define <32 x double> @sitofpv32i64v32double(<32 x i64> %a) {
; SSE2: cost of 320 {{.*}} sitofp
;
; AVX1-LABEL: sitofpv32i64v32double
; AVX1: cost of 87 {{.*}} sitofp
; AVX1: cost of 111 {{.*}} sitofp
;
; AVX2-LABEL: sitofpv32i64v32double
; AVX2: cost of 87 {{.*}} sitofp
; AVX2: cost of 111 {{.*}} sitofp
;
; AVX512F-LABEL: sitofpv32i64v32double
; AVX512F: cost of 91 {{.*}} sitofp

View File

@ -169,13 +169,13 @@ define <2 x double> @uitofpv2i32v2double(<2 x i32> %a) {
; SSE2: cost of 20 {{.*}} uitofp
;
; AVX1-LABEL: uitofpv2i32v2double
; AVX1: cost of 4 {{.*}} uitofp
; AVX1: cost of 6 {{.*}} uitofp
;
; AVX2-LABEL: uitofpv2i32v2double
; AVX2: cost of 4 {{.*}} uitofp
; AVX2: cost of 6 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv2i32v2double
; AVX512F: cost of 4 {{.*}} uitofp
; AVX512F: cost of 1 {{.*}} uitofp
%1 = uitofp <2 x i32> %a to <2 x double>
ret <2 x double> %1
}
@ -249,10 +249,10 @@ define <2 x double> @uitofpv2i64v2double(<2 x i64> %a) {
; SSE2: cost of 20 {{.*}} uitofp
;
; AVX1-LABEL: uitofpv2i64v2double
; AVX1: cost of 20 {{.*}} uitofp
; AVX1: cost of 10 {{.*}} uitofp
;
; AVX2-LABEL: uitofpv2i64v2double
; AVX2: cost of 20 {{.*}} uitofp
; AVX2: cost of 10 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv2i64v2double
; AVX512F: cost of 5 {{.*}} uitofp
@ -268,10 +268,10 @@ define <4 x double> @uitofpv4i64v4double(<4 x i64> %a) {
; SSE2: cost of 40 {{.*}} uitofp
;
; AVX1-LABEL: uitofpv4i64v4double
; AVX1: cost of 40 {{.*}} uitofp
; AVX1: cost of 20 {{.*}} uitofp
;
; AVX2-LABEL: uitofpv4i64v4double
; AVX2: cost of 40 {{.*}} uitofp
; AVX2: cost of 20 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv4i64v4double
; AVX512F: cost of 12 {{.*}} uitofp
@ -287,10 +287,10 @@ define <8 x double> @uitofpv8i64v8double(<8 x i64> %a) {
; SSE2: cost of 80 {{.*}} uitofp
;
; AVX1-LABEL: uitofpv8i64v8double
; AVX1: cost of 81 {{.*}} uitofp
; AVX1: cost of 41 {{.*}} uitofp
;
; AVX2-LABEL: uitofpv8i64v8double
; AVX2: cost of 81 {{.*}} uitofp
; AVX2: cost of 41 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv8i64v8double
; AVX512F: cost of 26 {{.*}} uitofp
@ -306,10 +306,10 @@ define <16 x double> @uitofpv16i64v16double(<16 x i64> %a) {
; SSE2: cost of 160 {{.*}} uitofp
;
; AVX1-LABEL: uitofpv16i64v16double
; AVX1: cost of 163 {{.*}} uitofp
; AVX1: cost of 83 {{.*}} uitofp
;
; AVX2-LABEL: uitofpv16i64v16double
; AVX2: cost of 163 {{.*}} uitofp
; AVX2: cost of 83 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv16i64v16double
; AVX512F: cost of 53 {{.*}} uitofp
@ -325,10 +325,10 @@ define <32 x double> @uitofpv32i64v32double(<32 x i64> %a) {
; SSE2: cost of 320 {{.*}} uitofp
;
; AVX1-LABEL: uitofpv32i64v32double
; AVX1: cost of 327 {{.*}} uitofp
; AVX1: cost of 167 {{.*}} uitofp
;
; AVX2-LABEL: uitofpv32i64v32double
; AVX2: cost of 327 {{.*}} uitofp
; AVX2: cost of 167 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv32i64v32double
; AVX512F: cost of 107 {{.*}} uitofp
@ -590,7 +590,7 @@ define <2 x float> @uitofpv2i64v2float(<2 x i64> %a) {
; AVX2: cost of 4 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv2i64v2float
; AVX512F: cost of 4 {{.*}} uitofp
; AVX512F: cost of 5 {{.*}} uitofp
%1 = uitofp <2 x i64> %a to <2 x float>
ret <2 x float> %1
}
@ -622,7 +622,7 @@ define <8 x float> @uitofpv8i64v8float(<8 x i64> %a) {
; AVX2: cost of 21 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv8i64v8float
; AVX512F: cost of 22 {{.*}} uitofp
; AVX512F: cost of 26 {{.*}} uitofp
%1 = uitofp <8 x i64> %a to <8 x float>
ret <8 x float> %1
}
@ -638,7 +638,7 @@ define <16 x float> @uitofpv16i64v16float(<16 x i64> %a) {
; AVX2: cost of 43 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv16i64v16float
; AVX512F: cost of 45 {{.*}} uitofp
; AVX512F: cost of 53 {{.*}} uitofp
%1 = uitofp <16 x i64> %a to <16 x float>
ret <16 x float> %1
}
@ -654,7 +654,7 @@ define <32 x float> @uitofpv32i64v32float(<32 x i64> %a) {
; AVX2: cost of 87 {{.*}} uitofp
;
; AVX512F-LABEL: uitofpv32i64v32float
; AVX512F: cost of 91 {{.*}} uitofp
; AVX512F: cost of 107 {{.*}} uitofp
%1 = uitofp <32 x i64> %a to <32 x float>
ret <32 x float> %1
}

View File

@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-apple-macosx10.8.0"
; CHECK: cost of 20 for VF 2 For instruction: %conv = uitofp i64 %tmp to double
; CHECK: cost of 40 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
; CHECK: cost of 10 for VF 2 For instruction: %conv = uitofp i64 %tmp to double
; CHECK: cost of 20 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
entry:
br label %for.body