mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 13:11:39 +01:00
194b0d60e7
Summary: These intrinsic instructions are all selected from intrinsics that have well defined behavior for where the upper bits come from. It's not the same place as the lower bits. As you can see we were suppressing load folding for these instructions in some cases. In none of the cases was the separate load helping avoid a partial dependency on the destination register. So we should just go ahead and allow the load to be folded. Only foldMemoryOperand was suppressing folding for these. They all have patterns for folding sse_load_f32/f64 that aren't gated with OptForSize, but sse_load_f32/f64 doesn't allow 128-bit vector loads. It only allows scalar_to_vector and vzmovl of scalar loads to match. There's no reason we can't allow a 128-bit vector load to be narrowed so I would like to fix sse_load_f32/f64 to allow that. And if I do that it changes some of these same test cases to fold the load too. Reviewers: spatel, zvi, RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D27611 llvm-svn: 289419
133 lines | 5.0 KiB | LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
|
|
|
|
; rdar: 12558838
|
|
; PR14221
|
|
; There is a mismatch between the intrinsic and the actual instruction.
|
|
; The actual instruction has a partial update of dest, while the intrinsic
|
|
; passes through the upper FP values. Here, we make sure the source and
|
|
; destination of each scalar unary op are the same.
|
|
|
|
; rsqrtss only writes the low float lane of its destination; the intrinsic's
; result carries the upper lanes through from the source operand. The CHECK
; lines therefore require source and destination to be the SAME register
; (rsqrtss %xmm0, %xmm0) so the partial update matches the pass-through
; semantics of @llvm.x86.sse.rsqrt.ss.
define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp {
|
|
; CHECK-LABEL: rsqrtss:
|
|
; CHECK: ## BB#0: ## %entry
|
|
; CHECK-NEXT: rsqrtss %xmm0, %xmm0
|
|
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
|
|
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
|
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
|
|
; CHECK-NEXT: movaps %xmm2, %xmm0
|
|
; CHECK-NEXT: jmp _callee ## TAILCALL
|
|
entry:
|
|
|
|
  %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
|
|
  ; Extract lanes 0 and 1 of the result: lane 0 is produced by rsqrtss, lane 1
  ; must be the passed-through upper bits of %a.
  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
|
|
  %conv = fpext float %a.addr.0.extract to double
|
|
  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
|
|
  %conv3 = fpext float %a.addr.4.extract to double
|
|
  ; Tail call keeps both converted lanes live so neither extract is folded away.
  tail call void @callee(double %conv, double %conv3) nounwind
|
|
  ret void
|
|
}
|
|
; External sink used by the tests above to keep both extracted lanes live.
declare void @callee(double, double)
|
|
; Scalar reciprocal-square-root intrinsic: low lane computed, upper lanes
; pass through from the input vector.
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
|
|
|
; Same pattern as @rsqrtss but for the scalar reciprocal intrinsic: rcpss
; performs a partial update of its destination, so the CHECK lines require
; matching source and destination registers (rcpss %xmm0, %xmm0) to preserve
; the intrinsic's pass-through of the upper lanes.
define void @rcpss(<4 x float> %a) nounwind uwtable ssp {
|
|
; CHECK-LABEL: rcpss:
|
|
; CHECK: ## BB#0: ## %entry
|
|
; CHECK-NEXT: rcpss %xmm0, %xmm0
|
|
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
|
|
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
|
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
|
|
; CHECK-NEXT: movaps %xmm2, %xmm0
|
|
; CHECK-NEXT: jmp _callee ## TAILCALL
|
|
entry:
|
|
|
|
  %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
|
|
  ; Lane 0 is the rcpss result; lane 1 must be %a's original upper bits.
  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
|
|
  %conv = fpext float %a.addr.0.extract to double
|
|
  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
|
|
  %conv3 = fpext float %a.addr.4.extract to double
|
|
  tail call void @callee(double %conv, double %conv3) nounwind
|
|
  ret void
|
|
}
|
|
; Scalar reciprocal intrinsic: low lane computed, upper lanes pass through.
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
|
|
|
; Same pattern for the scalar square-root intrinsic: sqrtss partially updates
; its destination, so the CHECK lines require source == destination
; (sqrtss %xmm0, %xmm0) to match the intrinsic's upper-lane pass-through.
define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
|
|
; CHECK-LABEL: sqrtss:
|
|
; CHECK: ## BB#0: ## %entry
|
|
; CHECK-NEXT: sqrtss %xmm0, %xmm0
|
|
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
|
|
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
|
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
|
|
; CHECK-NEXT: movaps %xmm2, %xmm0
|
|
; CHECK-NEXT: jmp _callee ## TAILCALL
|
|
entry:
|
|
|
|
  %0 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
|
|
  ; Lane 0 is the sqrtss result; lane 1 must be %a's original upper bits.
  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
|
|
  %conv = fpext float %a.addr.0.extract to double
|
|
  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
|
|
  %conv3 = fpext float %a.addr.4.extract to double
|
|
  tail call void @callee(double %conv, double %conv3) nounwind
|
|
  ret void
|
|
}
|
|
; Scalar square-root intrinsic: low lane computed, upper lanes pass through.
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
|
|
|
; Double-precision variant: sqrtsd only writes the low double lane, so the
; CHECK lines require source == destination (sqrtsd %xmm0, %xmm0) to match
; the intrinsic's pass-through of the upper lane. Note movhlps/cvtsd2ss
; replace the movshdup/cvtss2sd of the float variants.
define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
|
|
; CHECK-LABEL: sqrtsd:
|
|
; CHECK: ## BB#0: ## %entry
|
|
; CHECK-NEXT: sqrtsd %xmm0, %xmm0
|
|
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm2
|
|
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
|
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
|
|
; CHECK-NEXT: movaps %xmm2, %xmm0
|
|
; CHECK-NEXT: jmp _callee2 ## TAILCALL
|
|
entry:
|
|
|
|
  %0 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
|
|
  ; Lane 0 is the sqrtsd result; lane 1 must be %a's original upper double.
  %a0 = extractelement <2 x double> %0, i32 0
|
|
  %conv = fptrunc double %a0 to float
|
|
  %a1 = extractelement <2 x double> %0, i32 1
|
|
  %conv3 = fptrunc double %a1 to float
|
|
  tail call void @callee2(float %conv, float %conv3) nounwind
|
|
  ret void
|
|
}
|
|
|
|
; External sink for the double-precision test, keeping both lanes live.
declare void @callee2(float, float)
|
|
; Scalar double square-root intrinsic: low lane computed, upper lane passes
; through from the input vector.
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
|
|
|
; Verify that a full 128-bit vector load feeding the cvtss2sd intrinsic is
; narrowed to a scalar load and folded into the instruction
; (cvtss2sd (%rdi), %xmm0) instead of being kept as a separate load.
; The separate load gave no partial-dependency benefit here (see D27611,
; summarized in the header above).
define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
|
|
; CHECK-LABEL: load_fold_cvtss2sd_int:
|
|
; CHECK: ## BB#0:
|
|
; CHECK-NEXT: xorps %xmm0, %xmm0
|
|
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
|
|
; CHECK-NEXT: retq
|
|
  %ld = load <4 x float>, <4 x float> *%a
|
|
  ; First operand is a zero vector so the folded form fully defines the result.
  %x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
|
|
  ret <2 x double> %x
|
|
}
|
|
|
|
; Same as @load_fold_cvtss2sd_int but under optsize: load folding must still
; happen — folding is not (and should not be) gated on OptForSize.
define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
|
|
; CHECK-LABEL: load_fold_cvtss2sd_int_optsize:
|
|
; CHECK: ## BB#0:
|
|
; CHECK-NEXT: xorps %xmm0, %xmm0
|
|
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
|
|
; CHECK-NEXT: retq
|
|
  %ld = load <4 x float>, <4 x float> *%a
|
|
  %x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
|
|
  ret <2 x double> %x
|
|
}
|
|
|
|
; Same as @load_fold_cvtss2sd_int but under minsize: the vector load is still
; narrowed and folded into cvtss2sd.
define <2 x double> @load_fold_cvtss2sd_int_minsize(<4 x float> *%a) minsize {
|
|
; CHECK-LABEL: load_fold_cvtss2sd_int_minsize:
|
|
; CHECK: ## BB#0:
|
|
; CHECK-NEXT: xorps %xmm0, %xmm0
|
|
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
|
|
; CHECK-NEXT: retq
|
|
  %ld = load <4 x float>, <4 x float> *%a
|
|
  %x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
|
|
  ret <2 x double> %x
|
|
}
|
|
|
|
; Scalar float->double conversion intrinsic: converts the low float of the
; second operand; the upper lane comes from the first operand.
declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
|
|
|