mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
[X86] Make some cast costs more precise
Make some AVX and AVX512 cast costs more precise.
Based on part of a patch by Elena Demikhovsky (D15604).

Differential Revision: http://reviews.llvm.org/D22064

llvm-svn: 275106
This commit is contained in:
parent
b281da1235
commit
7e6a08b33c
@ -547,6 +547,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
||||
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
|
||||
};
|
||||
|
||||
// TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
|
||||
// 256-bit wide vectors.
|
||||
|
||||
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
|
||||
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
|
||||
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
|
||||
@ -577,6 +580,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
||||
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
|
||||
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
|
||||
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
|
||||
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
|
||||
|
||||
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
|
||||
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
|
||||
@ -591,11 +596,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
||||
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
|
||||
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
|
||||
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
|
||||
@ -685,6 +692,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
||||
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
|
||||
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
|
||||
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
|
||||
@ -693,9 +701,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
||||
// here. We have roughly 10 instructions per scalar element.
|
||||
// Multiply that by the vector width.
|
||||
// FIXME: remove that when PR19268 is fixed.
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
|
||||
|
||||
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 },
|
||||
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 },
|
||||
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
|
||||
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
|
||||
|
||||
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
|
||||
{ ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
|
||||
// This node is expanded into scalarized operations but BasicTTI is overly
|
||||
@ -705,6 +715,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
||||
// should be factored in too. Inflating the cost per element by 1.
|
||||
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
|
||||
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
|
||||
|
||||
{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
|
||||
{ ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
|
||||
};
|
||||
|
||||
static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
|
||||
|
@ -238,21 +238,21 @@ define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
|
||||
|
||||
define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) {
|
||||
;CHECK-LABEL: for function 'fp_conv'
|
||||
; CHECK-AVX512: cost of 1 {{.*}} fpext
|
||||
%A1 = fpext <8 x float> %a to <8 x double>
|
||||
; CHECK: cost of 1 {{.*}} %A1 = fpext
|
||||
%A1 = fpext <4 x float> %c to <4 x double>
|
||||
|
||||
; CHECK-AVX512: cost of 1 {{.*}} fpext
|
||||
%A2 = fpext <4 x float> %c to <4 x double>
|
||||
; CHECK-AVX: cost of 3 {{.*}} %A2 = fpext
|
||||
; CHECK-AVX2: cost of 3 {{.*}} %A2 = fpext
|
||||
; CHECK-AVX512: cost of 1 {{.*}} %A2 = fpext
|
||||
%A2 = fpext <8 x float> %a to <8 x double>
|
||||
|
||||
; CHECK-AVX2: cost of 3 {{.*}} %A3 = fpext
|
||||
; CHECK-AVX512: cost of 1 {{.*}} %A3 = fpext
|
||||
%A3 = fpext <8 x float> %a to <8 x double>
|
||||
; CHECK: cost of 1 {{.*}} %A3 = fptrunc
|
||||
%A3 = fptrunc <4 x double> undef to <4 x float>
|
||||
|
||||
; CHECK-AVX: cost of 3 {{.*}} %A4 = fptrunc
|
||||
; CHECK-AVX2: cost of 3 {{.*}} %A4 = fptrunc
|
||||
; CHECK-AVX512: cost of 1 {{.*}} %A4 = fptrunc
|
||||
%A4 = fptrunc <8 x double> undef to <8 x float>
|
||||
|
||||
; CHECK-AVX512: cost of 1 {{.*}} %A5 = fptrunc
|
||||
%A5 = fptrunc <4 x double> undef to <4 x float>
|
||||
ret void
|
||||
}
|
||||
|
@ -264,13 +264,13 @@ define <4 x double> @sitofpv4i64v4double(<4 x i64> %a) {
|
||||
; SSE2: cost of 40 {{.*}} sitofp
|
||||
;
|
||||
; AVX1-LABEL: sitofpv4i64v4double
|
||||
; AVX1: cost of 10 {{.*}} sitofp
|
||||
; AVX1: cost of 13 {{.*}} sitofp
|
||||
;
|
||||
; AVX2-LABEL: sitofpv4i64v4double
|
||||
; AVX2: cost of 10 {{.*}} sitofp
|
||||
; AVX2: cost of 13 {{.*}} sitofp
|
||||
;
|
||||
; AVX512F-LABEL: sitofpv4i64v4double
|
||||
; AVX512F: cost of 10 {{.*}} sitofp
|
||||
; AVX512F: cost of 13 {{.*}} sitofp
|
||||
%1 = sitofp <4 x i64> %a to <4 x double>
|
||||
ret <4 x double> %1
|
||||
}
|
||||
@ -280,10 +280,10 @@ define <8 x double> @sitofpv8i64v8double(<8 x i64> %a) {
|
||||
; SSE2: cost of 80 {{.*}} sitofp
|
||||
;
|
||||
; AVX1-LABEL: sitofpv8i64v8double
|
||||
; AVX1: cost of 21 {{.*}} sitofp
|
||||
; AVX1: cost of 27 {{.*}} sitofp
|
||||
;
|
||||
; AVX2-LABEL: sitofpv8i64v8double
|
||||
; AVX2: cost of 21 {{.*}} sitofp
|
||||
; AVX2: cost of 27 {{.*}} sitofp
|
||||
;
|
||||
; AVX512F-LABEL: sitofpv8i64v8double
|
||||
; AVX512F: cost of 22 {{.*}} sitofp
|
||||
@ -296,10 +296,10 @@ define <16 x double> @sitofpv16i64v16double(<16 x i64> %a) {
|
||||
; SSE2: cost of 160 {{.*}} sitofp
|
||||
;
|
||||
; AVX1-LABEL: sitofpv16i64v16double
|
||||
; AVX1: cost of 43 {{.*}} sitofp
|
||||
; AVX1: cost of 55 {{.*}} sitofp
|
||||
;
|
||||
; AVX2-LABEL: sitofpv16i64v16double
|
||||
; AVX2: cost of 43 {{.*}} sitofp
|
||||
; AVX2: cost of 55 {{.*}} sitofp
|
||||
;
|
||||
; AVX512F-LABEL: sitofpv16i64v16double
|
||||
; AVX512F: cost of 45 {{.*}} sitofp
|
||||
@ -312,10 +312,10 @@ define <32 x double> @sitofpv32i64v32double(<32 x i64> %a) {
|
||||
; SSE2: cost of 320 {{.*}} sitofp
|
||||
;
|
||||
; AVX1-LABEL: sitofpv32i64v32double
|
||||
; AVX1: cost of 87 {{.*}} sitofp
|
||||
; AVX1: cost of 111 {{.*}} sitofp
|
||||
;
|
||||
; AVX2-LABEL: sitofpv32i64v32double
|
||||
; AVX2: cost of 87 {{.*}} sitofp
|
||||
; AVX2: cost of 111 {{.*}} sitofp
|
||||
;
|
||||
; AVX512F-LABEL: sitofpv32i64v32double
|
||||
; AVX512F: cost of 91 {{.*}} sitofp
|
||||
|
@ -169,13 +169,13 @@ define <2 x double> @uitofpv2i32v2double(<2 x i32> %a) {
|
||||
; SSE2: cost of 20 {{.*}} uitofp
|
||||
;
|
||||
; AVX1-LABEL: uitofpv2i32v2double
|
||||
; AVX1: cost of 4 {{.*}} uitofp
|
||||
; AVX1: cost of 6 {{.*}} uitofp
|
||||
;
|
||||
; AVX2-LABEL: uitofpv2i32v2double
|
||||
; AVX2: cost of 4 {{.*}} uitofp
|
||||
; AVX2: cost of 6 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv2i32v2double
|
||||
; AVX512F: cost of 4 {{.*}} uitofp
|
||||
; AVX512F: cost of 1 {{.*}} uitofp
|
||||
%1 = uitofp <2 x i32> %a to <2 x double>
|
||||
ret <2 x double> %1
|
||||
}
|
||||
@ -249,10 +249,10 @@ define <2 x double> @uitofpv2i64v2double(<2 x i64> %a) {
|
||||
; SSE2: cost of 20 {{.*}} uitofp
|
||||
;
|
||||
; AVX1-LABEL: uitofpv2i64v2double
|
||||
; AVX1: cost of 20 {{.*}} uitofp
|
||||
; AVX1: cost of 10 {{.*}} uitofp
|
||||
;
|
||||
; AVX2-LABEL: uitofpv2i64v2double
|
||||
; AVX2: cost of 20 {{.*}} uitofp
|
||||
; AVX2: cost of 10 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv2i64v2double
|
||||
; AVX512F: cost of 5 {{.*}} uitofp
|
||||
@ -268,10 +268,10 @@ define <4 x double> @uitofpv4i64v4double(<4 x i64> %a) {
|
||||
; SSE2: cost of 40 {{.*}} uitofp
|
||||
;
|
||||
; AVX1-LABEL: uitofpv4i64v4double
|
||||
; AVX1: cost of 40 {{.*}} uitofp
|
||||
; AVX1: cost of 20 {{.*}} uitofp
|
||||
;
|
||||
; AVX2-LABEL: uitofpv4i64v4double
|
||||
; AVX2: cost of 40 {{.*}} uitofp
|
||||
; AVX2: cost of 20 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv4i64v4double
|
||||
; AVX512F: cost of 12 {{.*}} uitofp
|
||||
@ -287,10 +287,10 @@ define <8 x double> @uitofpv8i64v8double(<8 x i64> %a) {
|
||||
; SSE2: cost of 80 {{.*}} uitofp
|
||||
;
|
||||
; AVX1-LABEL: uitofpv8i64v8double
|
||||
; AVX1: cost of 81 {{.*}} uitofp
|
||||
; AVX1: cost of 41 {{.*}} uitofp
|
||||
;
|
||||
; AVX2-LABEL: uitofpv8i64v8double
|
||||
; AVX2: cost of 81 {{.*}} uitofp
|
||||
; AVX2: cost of 41 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv8i64v8double
|
||||
; AVX512F: cost of 26 {{.*}} uitofp
|
||||
@ -306,10 +306,10 @@ define <16 x double> @uitofpv16i64v16double(<16 x i64> %a) {
|
||||
; SSE2: cost of 160 {{.*}} uitofp
|
||||
;
|
||||
; AVX1-LABEL: uitofpv16i64v16double
|
||||
; AVX1: cost of 163 {{.*}} uitofp
|
||||
; AVX1: cost of 83 {{.*}} uitofp
|
||||
;
|
||||
; AVX2-LABEL: uitofpv16i64v16double
|
||||
; AVX2: cost of 163 {{.*}} uitofp
|
||||
; AVX2: cost of 83 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv16i64v16double
|
||||
; AVX512F: cost of 53 {{.*}} uitofp
|
||||
@ -325,10 +325,10 @@ define <32 x double> @uitofpv32i64v32double(<32 x i64> %a) {
|
||||
; SSE2: cost of 320 {{.*}} uitofp
|
||||
;
|
||||
; AVX1-LABEL: uitofpv32i64v32double
|
||||
; AVX1: cost of 327 {{.*}} uitofp
|
||||
; AVX1: cost of 167 {{.*}} uitofp
|
||||
;
|
||||
; AVX2-LABEL: uitofpv32i64v32double
|
||||
; AVX2: cost of 327 {{.*}} uitofp
|
||||
; AVX2: cost of 167 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv32i64v32double
|
||||
; AVX512F: cost of 107 {{.*}} uitofp
|
||||
@ -590,7 +590,7 @@ define <2 x float> @uitofpv2i64v2float(<2 x i64> %a) {
|
||||
; AVX2: cost of 4 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv2i64v2float
|
||||
; AVX512F: cost of 4 {{.*}} uitofp
|
||||
; AVX512F: cost of 5 {{.*}} uitofp
|
||||
%1 = uitofp <2 x i64> %a to <2 x float>
|
||||
ret <2 x float> %1
|
||||
}
|
||||
@ -622,7 +622,7 @@ define <8 x float> @uitofpv8i64v8float(<8 x i64> %a) {
|
||||
; AVX2: cost of 21 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv8i64v8float
|
||||
; AVX512F: cost of 22 {{.*}} uitofp
|
||||
; AVX512F: cost of 26 {{.*}} uitofp
|
||||
%1 = uitofp <8 x i64> %a to <8 x float>
|
||||
ret <8 x float> %1
|
||||
}
|
||||
@ -638,7 +638,7 @@ define <16 x float> @uitofpv16i64v16float(<16 x i64> %a) {
|
||||
; AVX2: cost of 43 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv16i64v16float
|
||||
; AVX512F: cost of 45 {{.*}} uitofp
|
||||
; AVX512F: cost of 53 {{.*}} uitofp
|
||||
%1 = uitofp <16 x i64> %a to <16 x float>
|
||||
ret <16 x float> %1
|
||||
}
|
||||
@ -654,7 +654,7 @@ define <32 x float> @uitofpv32i64v32float(<32 x i64> %a) {
|
||||
; AVX2: cost of 87 {{.*}} uitofp
|
||||
;
|
||||
; AVX512F-LABEL: uitofpv32i64v32float
|
||||
; AVX512F: cost of 91 {{.*}} uitofp
|
||||
; AVX512F: cost of 107 {{.*}} uitofp
|
||||
%1 = uitofp <32 x i64> %a to <32 x float>
|
||||
ret <32 x float> %1
|
||||
}
|
||||
|
@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
|
||||
; CHECK: cost of 20 for VF 2 For instruction: %conv = uitofp i64 %tmp to double
|
||||
; CHECK: cost of 40 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
|
||||
; CHECK: cost of 10 for VF 2 For instruction: %conv = uitofp i64 %tmp to double
|
||||
; CHECK: cost of 20 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
|
||||
define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
Loading…
x
Reference in New Issue
Block a user