[x86, InstCombine] transform x86 AVX masked loads to LLVM intrinsics
The intended effect of this patch in conjunction with:
http://reviews.llvm.org/rL259392
http://reviews.llvm.org/rL260145

is that customers using the AVX intrinsics in C will benefit from combines when the load mask is constant:

__m128 mload_zeros(float *f) {
  return _mm_maskload_ps(f, _mm_set1_epi32(0));
}

__m128 mload_fakeones(float *f) {
  return _mm_maskload_ps(f, _mm_set1_epi32(1));
}

__m128 mload_ones(float *f) {
  return _mm_maskload_ps(f, _mm_set1_epi32(0x80000000));
}

__m128 mload_oneset(float *f) {
  return _mm_maskload_ps(f, _mm_set_epi32(0x80000000, 0, 0, 0));
}

...so none of the above will actually generate a masked load for optimized code.

This is the masked load counterpart to:
http://reviews.llvm.org/rL262064

llvm-svn: 262269
parent bfb6539d9f
commit 4d045a2e91
@@ -828,6 +828,39 @@ static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);

  // Special case a zero mask since that's not a ConstantDataVector.
  // This masked load instruction does nothing, so return an undef.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return nullptr;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  CallInst *NewMaskedLoad = IC.Builder->CreateMaskedLoad(PtrCast, 1, BoolMask);
  return IC.replaceInstUsesWith(II, NewMaskedLoad);
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
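Note: the new load path leans on getNegativeIsTrueBoolVec() to turn the XMM integer mask into the <4 x i1> mask that llvm.masked.load expects. That helper is not part of this diff; it already lives in InstCombineCalls.cpp (introduced with the masked-store counterpart, rL262064). The following is only a minimal sketch of what such a helper has to do, assuming the LLVM Constant APIs of that era rather than the exact in-tree code:

// Sketch only: approximates the contract of the existing helper in
// InstCombineCalls.cpp; not copied verbatim from the tree.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Return an <N x i1> constant that is true exactly in the lanes where the
// input constant data vector has its sign bit set.
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  SmallVector<Constant *, 32> BoolVec;
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
    Constant *Elt = V->getElementAsConstant(I);
    // Works for integer and FP element types alike; only the sign bit matters.
    bool Sign = V->getElementType()->isIntegerTy()
                    ? cast<ConstantInt>(Elt)->isNegative()
                    : cast<ConstantFP>(Elt)->isNegative();
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
  }
  return ConstantVector::get(BoolVec);
}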
@@ -854,7 +887,6 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);

  Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
@@ -1630,6 +1662,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      return replaceInstUsesWith(*II, V);
    break;

  case Intrinsic::x86_avx_maskload_ps:
    // TODO: Add the other masked load variants.
    if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
      return I;
    break;

  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
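Note: the TODO in the new case is left for a follow-up. Since simplifyX86MaskedLoad() only inspects the call's result type and mask operand, the remaining AVX/AVX2 masked-load intrinsics could be routed through the same helper. A hypothetical fragment of that extension (the intrinsic IDs are real, but this grouping is a sketch and not part of this patch):

  // Hypothetical follow-up sketch, not in this commit: fold the other masked
  // load variants through the same path as x86_avx_maskload_ps.
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
      return I;
    break;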
@@ -1,5 +1,64 @@
; RUN: opt < %s -instcombine -S | FileCheck %s

;; MASKED LOADS

; If the mask isn't constant, do nothing.

define <4 x float> @mload(i8* %f, <4 x i32> %mask) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
  ret <4 x float> %ld

; CHECK-LABEL: @mload(
; CHECK-NEXT:  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
; CHECK-NEXT:  ret <4 x float> %ld
}

; Zero mask is a nop.

define <4 x float> @mload_zeros(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_zeros(
; CHECK-NEXT:  ret <4 x float> undef
}

; Only the sign bit matters.

define <4 x float> @mload_fake_ones(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_fake_ones(
; CHECK-NEXT:  ret <4 x float> undef
}

; All mask bits are set, so this is just a vector load.

define <4 x float> @mload_real_ones(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 2147483648>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_real_ones(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  %unmaskedload = load <4 x float>, <4 x float>* %castvec
; CHECK-NEXT:  ret <4 x float> %unmaskedload
}

; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.

define <4 x float> @mload_one_one(i8* %f) {
  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
  ret <4 x float> %ld

; CHECK-LABEL: @mload_one_one(
; CHECK-NEXT:  %castvec = bitcast i8* %f to <4 x float>*
; CHECK-NEXT:  %1 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> undef)
; CHECK-NEXT:  ret <4 x float> %1
}

;; MASKED STORES

; If the mask isn't constant, do nothing.

define void @mstore(i8* %f, <4 x i32> %mask, <4 x float> %v) {
@@ -131,6 +190,10 @@ define void @mstore_v4i64(i8* %f, <4 x i64> %v) {
; CHECK-NEXT:  ret void
}

declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>)

declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>)