Mirror of https://github.com/RPCS3/llvm-mirror.git (last synced 2024-11-25 20:23:11 +01:00)
[X86] Fix SQRTSS/SQRTSD/RCPSS/RSQRTSS intrinsics to use sse_load_f32/sse_load_f64 to increase load folding opportunities.
llvm-svn: 318016
This commit is contained in:
parent
a400aaf166
commit
64463b58e5
@ -7588,11 +7588,10 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
|
||||
(_.VT _.RC:$src2),
|
||||
(i32 FROUND_CURRENT))>;
|
||||
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
|
||||
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
|
||||
"$src2, $src1", "$src1, $src2",
|
||||
(X86fsqrtRnds (_.VT _.RC:$src1),
|
||||
(_.VT (scalar_to_vector
|
||||
(_.ScalarLdFrag addr:$src2))),
|
||||
_.ScalarIntMemCPat:$src2,
|
||||
(i32 FROUND_CURRENT))>;
|
||||
|
||||
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
|
||||
@ -7630,7 +7629,7 @@ let Predicates = [HasAVX512, OptForSize] in {
|
||||
(!cast<Instruction>(NAME#SUFF#Zm)
|
||||
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
|
||||
|
||||
def : Pat<(Intr (scalar_to_vector (_.EltVT (load addr:$src2)))),
|
||||
def : Pat<(Intr _.ScalarIntMemCPat:$src2),
|
||||
(!cast<Instruction>(NAME#SUFF#Zm_Int)
|
||||
(_.VT (IMPLICIT_DEF)), addr:$src2)>;
|
||||
}
|
||||
|
@ -3040,6 +3040,7 @@ def SSE_RCPS : OpndItins<
|
||||
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
ValueType vt, ValueType ScalarVT,
|
||||
X86MemOperand x86memop,
|
||||
Operand intmemop, ComplexPattern int_cpat,
|
||||
Intrinsic Intr,
|
||||
SDNode OpNode, Domain d, OpndItins itins,
|
||||
Predicate target, string Suffix> {
|
||||
@ -3060,7 +3061,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||
let mayLoad = 1 in
|
||||
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
|
||||
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||
}
|
||||
@ -3080,7 +3081,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
// which has a clobber before the rcp, vs.
|
||||
// rcpss mem, %xmm0
|
||||
let Predicates = [target, OptForSize] in {
|
||||
def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
|
||||
def : Pat<(Intr int_cpat:$src2),
|
||||
(!cast<Instruction>(NAME#Suffix##m_Int)
|
||||
(vt (IMPLICIT_DEF)), addr:$src2)>;
|
||||
}
|
||||
@ -3089,6 +3090,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
ValueType vt, ValueType ScalarVT,
|
||||
X86MemOperand x86memop,
|
||||
Operand intmemop, ComplexPattern int_cpat,
|
||||
Intrinsic Intr, SDNode OpNode, Domain d,
|
||||
OpndItins itins, Predicate target, string Suffix> {
|
||||
let hasSideEffects = 0 in {
|
||||
@ -3106,7 +3108,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
[]>, Sched<[itins.Sched.Folded]>;
|
||||
let mayLoad = 1 in
|
||||
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
|
||||
(ins VR128:$src1, x86memop:$src2),
|
||||
(ins VR128:$src1, intmemop:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
|
||||
}
|
||||
@ -3129,7 +3131,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
VR128:$src)>;
|
||||
}
|
||||
let Predicates = [target, OptForSize] in {
|
||||
def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
|
||||
def : Pat<(Intr int_cpat:$src2),
|
||||
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
|
||||
(vt (IMPLICIT_DEF)), addr:$src2)>;
|
||||
def : Pat<(ScalarVT (OpNode (load addr:$src))),
|
||||
@ -3213,10 +3215,11 @@ let Predicates = [HasAVX, NoVLX] in {
|
||||
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
OpndItins itins, Predicate AVXTarget> {
|
||||
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
|
||||
ssmem, sse_load_f32,
|
||||
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
|
||||
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
|
||||
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
|
||||
f32mem,
|
||||
f32mem, ssmem, sse_load_f32,
|
||||
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
|
||||
SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
|
||||
VEX_LIG, VEX_WIG, NotMemoryFoldable;
|
||||
@ -3225,10 +3228,11 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
OpndItins itins, Predicate AVXTarget> {
|
||||
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
|
||||
sdmem, sse_load_f64,
|
||||
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
|
||||
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
|
||||
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
|
||||
f64mem,
|
||||
f64mem, sdmem, sse_load_f64,
|
||||
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
|
||||
OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
|
||||
XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
|
||||
|
@ -101,14 +101,12 @@ define float @rcpss_size(float* %a) optsize {
|
||||
define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
|
||||
; SSE-LABEL: rcpss_full_size:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
||||
; SSE-NEXT: rcpss %xmm0, %xmm0
|
||||
; SSE-NEXT: rcpss (%rdi), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: rcpss_full_size:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
||||
; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
|
||||
; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%ld = load <4 x float>, <4 x float>* %a
|
||||
%res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
|
||||
@ -135,14 +133,12 @@ define float @rsqrtss_size(float* %a) optsize {
|
||||
define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
|
||||
; SSE-LABEL: rsqrtss_full_size:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
||||
; SSE-NEXT: rsqrtss %xmm0, %xmm0
|
||||
; SSE-NEXT: rsqrtss (%rdi), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: rsqrtss_full_size:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
||||
; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
|
||||
; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%ld = load <4 x float>, <4 x float>* %a
|
||||
%res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
|
||||
@ -169,14 +165,12 @@ define float @sqrtss_size(float* %a) optsize{
|
||||
define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{
|
||||
; SSE-LABEL: sqrtss_full_size:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
||||
; SSE-NEXT: sqrtss %xmm0, %xmm0
|
||||
; SSE-NEXT: sqrtss (%rdi), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: sqrtss_full_size:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
||||
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
|
||||
; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%ld = load <4 x float>, <4 x float>* %a
|
||||
%res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
|
||||
@ -203,14 +197,12 @@ define double @sqrtsd_size(double* %a) optsize {
|
||||
define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
|
||||
; SSE-LABEL: sqrtsd_full_size:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movapd (%rdi), %xmm0
|
||||
; SSE-NEXT: sqrtsd %xmm0, %xmm0
|
||||
; SSE-NEXT: sqrtsd (%rdi), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: sqrtsd_full_size:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vmovapd (%rdi), %xmm0
|
||||
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
|
||||
; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%ld = load <2 x double>, <2 x double>* %a
|
||||
%res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
|
||||
|
Loading…
Reference in New Issue
Block a user