Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2024-11-25 20:23:11 +01:00)
[X86] Fix SQRTSS/SQRTSD/RCPSS/RCPSD intrinsics to use sse_load_f32/sse_load_f64 to increase load folding opportunities.
llvm-svn: 318016
This commit is contained in:
Parent: a400aaf166
Commit: 64463b58e5
@ -7588,11 +7588,10 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
|
|||||||
(_.VT _.RC:$src2),
|
(_.VT _.RC:$src2),
|
||||||
(i32 FROUND_CURRENT))>;
|
(i32 FROUND_CURRENT))>;
|
||||||
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||||
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
|
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
|
||||||
"$src2, $src1", "$src1, $src2",
|
"$src2, $src1", "$src1, $src2",
|
||||||
(X86fsqrtRnds (_.VT _.RC:$src1),
|
(X86fsqrtRnds (_.VT _.RC:$src1),
|
||||||
(_.VT (scalar_to_vector
|
_.ScalarIntMemCPat:$src2,
|
||||||
(_.ScalarLdFrag addr:$src2))),
|
|
||||||
(i32 FROUND_CURRENT))>;
|
(i32 FROUND_CURRENT))>;
|
||||||
|
|
||||||
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
|
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
|
||||||
@ -7630,7 +7629,7 @@ let Predicates = [HasAVX512, OptForSize] in {
|
|||||||
(!cast<Instruction>(NAME#SUFF#Zm)
|
(!cast<Instruction>(NAME#SUFF#Zm)
|
||||||
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
|
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
|
||||||
|
|
||||||
def : Pat<(Intr (scalar_to_vector (_.EltVT (load addr:$src2)))),
|
def : Pat<(Intr _.ScalarIntMemCPat:$src2),
|
||||||
(!cast<Instruction>(NAME#SUFF#Zm_Int)
|
(!cast<Instruction>(NAME#SUFF#Zm_Int)
|
||||||
(_.VT (IMPLICIT_DEF)), addr:$src2)>;
|
(_.VT (IMPLICIT_DEF)), addr:$src2)>;
|
||||||
}
|
}
|
||||||
|
@ -3040,6 +3040,7 @@ def SSE_RCPS : OpndItins<
|
|||||||
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||||
ValueType vt, ValueType ScalarVT,
|
ValueType vt, ValueType ScalarVT,
|
||||||
X86MemOperand x86memop,
|
X86MemOperand x86memop,
|
||||||
|
Operand intmemop, ComplexPattern int_cpat,
|
||||||
Intrinsic Intr,
|
Intrinsic Intr,
|
||||||
SDNode OpNode, Domain d, OpndItins itins,
|
SDNode OpNode, Domain d, OpndItins itins,
|
||||||
Predicate target, string Suffix> {
|
Predicate target, string Suffix> {
|
||||||
@ -3060,7 +3061,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
|||||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||||
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||||
let mayLoad = 1 in
|
let mayLoad = 1 in
|
||||||
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
|
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
|
||||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||||
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||||
}
|
}
|
||||||
@ -3080,7 +3081,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
|||||||
// which has a clobber before the rcp, vs.
|
// which has a clobber before the rcp, vs.
|
||||||
// rcpss mem, %xmm0
|
// rcpss mem, %xmm0
|
||||||
let Predicates = [target, OptForSize] in {
|
let Predicates = [target, OptForSize] in {
|
||||||
def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
|
def : Pat<(Intr int_cpat:$src2),
|
||||||
(!cast<Instruction>(NAME#Suffix##m_Int)
|
(!cast<Instruction>(NAME#Suffix##m_Int)
|
||||||
(vt (IMPLICIT_DEF)), addr:$src2)>;
|
(vt (IMPLICIT_DEF)), addr:$src2)>;
|
||||||
}
|
}
|
||||||
@ -3089,6 +3090,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
|||||||
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||||
ValueType vt, ValueType ScalarVT,
|
ValueType vt, ValueType ScalarVT,
|
||||||
X86MemOperand x86memop,
|
X86MemOperand x86memop,
|
||||||
|
Operand intmemop, ComplexPattern int_cpat,
|
||||||
Intrinsic Intr, SDNode OpNode, Domain d,
|
Intrinsic Intr, SDNode OpNode, Domain d,
|
||||||
OpndItins itins, Predicate target, string Suffix> {
|
OpndItins itins, Predicate target, string Suffix> {
|
||||||
let hasSideEffects = 0 in {
|
let hasSideEffects = 0 in {
|
||||||
@ -3106,7 +3108,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
|||||||
[]>, Sched<[itins.Sched.Folded]>;
|
[]>, Sched<[itins.Sched.Folded]>;
|
||||||
let mayLoad = 1 in
|
let mayLoad = 1 in
|
||||||
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
|
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
|
||||||
(ins VR128:$src1, x86memop:$src2),
|
(ins VR128:$src1, intmemop:$src2),
|
||||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||||
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||||
}
|
}
|
||||||
@ -3129,7 +3131,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
|||||||
VR128:$src)>;
|
VR128:$src)>;
|
||||||
}
|
}
|
||||||
let Predicates = [target, OptForSize] in {
|
let Predicates = [target, OptForSize] in {
|
||||||
def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
|
def : Pat<(Intr int_cpat:$src2),
|
||||||
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
|
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
|
||||||
(vt (IMPLICIT_DEF)), addr:$src2)>;
|
(vt (IMPLICIT_DEF)), addr:$src2)>;
|
||||||
def : Pat<(ScalarVT (OpNode (load addr:$src))),
|
def : Pat<(ScalarVT (OpNode (load addr:$src))),
|
||||||
@ -3213,10 +3215,11 @@ let Predicates = [HasAVX, NoVLX] in {
|
|||||||
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||||
OpndItins itins, Predicate AVXTarget> {
|
OpndItins itins, Predicate AVXTarget> {
|
||||||
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
|
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
|
||||||
|
ssmem, sse_load_f32,
|
||||||
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
|
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
|
||||||
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
|
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
|
||||||
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
|
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
|
||||||
f32mem,
|
f32mem, ssmem, sse_load_f32,
|
||||||
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
|
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
|
||||||
SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
|
SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
|
||||||
VEX_LIG, VEX_WIG, NotMemoryFoldable;
|
VEX_LIG, VEX_WIG, NotMemoryFoldable;
|
||||||
@ -3225,10 +3228,11 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
|||||||
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||||
OpndItins itins, Predicate AVXTarget> {
|
OpndItins itins, Predicate AVXTarget> {
|
||||||
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
|
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
|
||||||
|
sdmem, sse_load_f64,
|
||||||
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
|
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
|
||||||
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
|
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
|
||||||
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
|
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
|
||||||
f64mem,
|
f64mem, sdmem, sse_load_f64,
|
||||||
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
|
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
|
||||||
OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
|
OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
|
||||||
XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
|
XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
|
||||||
|
@ -101,14 +101,12 @@ define float @rcpss_size(float* %a) optsize {
|
|||||||
define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
|
define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
|
||||||
; SSE-LABEL: rcpss_full_size:
|
; SSE-LABEL: rcpss_full_size:
|
||||||
; SSE: # BB#0:
|
; SSE: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE-NEXT: rcpss (%rdi), %xmm0
|
||||||
; SSE-NEXT: rcpss %xmm0, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: rcpss_full_size:
|
; AVX-LABEL: rcpss_full_size:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
|
||||||
; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%ld = load <4 x float>, <4 x float>* %a
|
%ld = load <4 x float>, <4 x float>* %a
|
||||||
%res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
|
%res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
|
||||||
@ -135,14 +133,12 @@ define float @rsqrtss_size(float* %a) optsize {
|
|||||||
define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
|
define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
|
||||||
; SSE-LABEL: rsqrtss_full_size:
|
; SSE-LABEL: rsqrtss_full_size:
|
||||||
; SSE: # BB#0:
|
; SSE: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE-NEXT: rsqrtss (%rdi), %xmm0
|
||||||
; SSE-NEXT: rsqrtss %xmm0, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: rsqrtss_full_size:
|
; AVX-LABEL: rsqrtss_full_size:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
|
||||||
; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%ld = load <4 x float>, <4 x float>* %a
|
%ld = load <4 x float>, <4 x float>* %a
|
||||||
%res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
|
%res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
|
||||||
@ -169,14 +165,12 @@ define float @sqrtss_size(float* %a) optsize{
|
|||||||
define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{
|
define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{
|
||||||
; SSE-LABEL: sqrtss_full_size:
|
; SSE-LABEL: sqrtss_full_size:
|
||||||
; SSE: # BB#0:
|
; SSE: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE-NEXT: sqrtss (%rdi), %xmm0
|
||||||
; SSE-NEXT: sqrtss %xmm0, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: sqrtss_full_size:
|
; AVX-LABEL: sqrtss_full_size:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
|
||||||
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%ld = load <4 x float>, <4 x float>* %a
|
%ld = load <4 x float>, <4 x float>* %a
|
||||||
%res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
|
%res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
|
||||||
@ -203,14 +197,12 @@ define double @sqrtsd_size(double* %a) optsize {
|
|||||||
define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
|
define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
|
||||||
; SSE-LABEL: sqrtsd_full_size:
|
; SSE-LABEL: sqrtsd_full_size:
|
||||||
; SSE: # BB#0:
|
; SSE: # BB#0:
|
||||||
; SSE-NEXT: movapd (%rdi), %xmm0
|
; SSE-NEXT: sqrtsd (%rdi), %xmm0
|
||||||
; SSE-NEXT: sqrtsd %xmm0, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: sqrtsd_full_size:
|
; AVX-LABEL: sqrtsd_full_size:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovapd (%rdi), %xmm0
|
; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
|
||||||
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%ld = load <2 x double>, <2 x double>* %a
|
%ld = load <2 x double>, <2 x double>* %a
|
||||||
%res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
|
%res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
|
||||||
|
Loading…
Reference in New Issue
Block a user