Masked load and store codegen - fixed 128-bit vectors
The codegen failed on 128-bit types on AVX2. Added patterns in the .td files and tests. llvm-svn: 224647
parent a3a9080dce
commit 744da8554e
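In IR terms, the failing case is a 128-bit masked load or store on an AVX2 target. A minimal sketch of that case (the function name is illustrative; the real coverage is in the new tests added at the bottom of this diff, whose intrinsic signatures it mirrors):

; Illustrative only -- with -mattr=+avx2 this now selects to vmaskmovps plus a
; blend for the pass-through lanes, instead of failing in codegen.
define <4 x float> @mload_v4f32(i8* %addr, <4 x i32> %trigger, <4 x float> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(i8* %addr, <4 x float> %dst, i32 4, <4 x i1> %mask)
  ret <4 x float> %res
}
declare <4 x float> @llvm.masked.load.v4f32(i8*, <4 x float>, i32, <4 x i1>)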
@@ -1314,23 +1314,19 @@ void X86TargetLowering::resetOperationActions() {
          i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
       MVT VT = (MVT::SimpleValueType)i;
 
+      if (VT.getScalarSizeInBits() >= 32) {
+        setOperationAction(ISD::MLOAD, VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
+      }
       // Extract subvector is special because the value type
       // (result) is 128-bit but the source is 256-bit wide.
       if (VT.is128BitVector()) {
-        if (VT.getScalarSizeInBits() >= 32) {
-          setOperationAction(ISD::MLOAD, VT, Custom);
-          setOperationAction(ISD::MSTORE, VT, Custom);
-        }
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
       }
       // Do not attempt to custom lower other non-256-bit vectors
       if (!VT.is256BitVector())
         continue;
 
-      if (VT.getScalarSizeInBits() >= 32) {
-        setOperationAction(ISD::MLOAD, VT, Legal);
-        setOperationAction(ISD::MSTORE, VT, Legal);
-      }
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
@@ -1499,10 +1495,6 @@ void X86TargetLowering::resetOperationActions() {
       // (result) is 256/128-bit but the source is 512-bit wide.
       if (VT.is128BitVector() || VT.is256BitVector()) {
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-        if ( EltSize >= 32) {
-          setOperationAction(ISD::MLOAD, VT, Legal);
-          setOperationAction(ISD::MSTORE, VT, Legal);
-        }
       }
       if (VT.getVectorElementType() == MVT::i1)
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
@@ -2211,6 +2211,11 @@ def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask,
 def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))),
                 (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
 
+def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))),
+                (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk
+                 (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm),
+                 (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
 defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32",
                                 "16", "8", "4", SSEPackedInt, HasAVX512>,
                  avx512_store_vl<0x7F, "vmovdqa32", "alignedstore",
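The v8f32 pattern added above covers plain AVX-512F without VLX: the 256-bit pass-through value is inserted into an undef zmm register, a 512-bit masked vmovups is issued with the v8i1 mask copied into a v16i1 mask register, and the low ymm is extracted back out. A minimal IR sketch of the case it matches (function name is illustrative, in the style of the tests added below):

; Illustrative only -- on an AVX-512F target this should select to a single
; 512-bit vmovups with a k-register mask rather than being split.
define <8 x float> @mload_v8f32(i8* %addr, <8 x i32> %trigger, <8 x float> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32(i8* %addr, <8 x float> %dst, i32 4, <8 x i1> %mask)
  ret <8 x float> %res
}
declare <8 x float> @llvm.masked.load.v8f32(i8*, <8 x float>, i32, <8 x i1>)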
@@ -8966,20 +8966,26 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                 int_x86_avx2_maskstore_q_256>, VEX_W;
 
 def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
-         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
 def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
          (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
+def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
+         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
+         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
-         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
                              (bc_v8f32 (v8i32 immAllZerosV)))),
-         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
-         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
+         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
 def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
@@ -8992,21 +8998,42 @@ def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0)
          (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
+                             (bc_v4f32 (v4i32 immAllZerosV)))),
+         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
+         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
+         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
+         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
+
 def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
-         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
 def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
          (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
-         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                              (v4f64 immAllZerosV))),
-         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
 
 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
-         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
+         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
 def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
@@ -9020,6 +9047,33 @@ def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0)
          (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
 
+def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
+         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
+         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+                             (v2f64 immAllZerosV))),
+         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
+         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+                             (bc_v2i64 (v4i32 immAllZerosV)))),
+         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
+         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
+                      VR128:$mask)>;
+
 //===----------------------------------------------------------------------===//
 // Variable Bit Shifts
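A note on the $src0 (pass-through) variants above: vmaskmov/vpmaskmov zero the masked-off lanes, so the pass-through value has to be merged back in with a variable blend keyed on the same mask. In IR terms the blend implements the per-lane select below (sketch only; operand names are illustrative):

; Lanes whose mask bit is set take the freshly loaded element, the rest keep %dst.
%res = select <4 x i1> %mask, <4 x i32> %loaded, <4 x i32> %dst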
@@ -41,8 +41,8 @@ define void @test3(<16 x i32> %trigger, i8* %addr, <16 x i32> %val) {
 ; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}
 
 ; AVX2-LABEL: test4
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: vpmaskmovd {{.*}}(%rdi)
+; AVX2: vmaskmovps {{.*}}(%rdi)
+; AVX2: vmaskmovps {{.*}}(%rdi)
 ; AVX2: blend
 define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) {
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -54,9 +54,9 @@ define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) {
 ; AVX512: vmovupd (%rdi), %zmm1 {%k1}
 
 ; AVX2-LABEL: test5
-; AVX2: vpmaskmovq
+; AVX2: vmaskmovpd
 ; AVX2: vblendvpd
-; AVX2: vpmaskmovq
+; AVX2: vmaskmovpd
 ; AVX2: vblendvpd
 define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) {
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -64,10 +64,80 @@ define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) {
   ret <8 x double> %res
 }
 
-declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
-declare void @llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
-declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>)
-declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>)
-declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>)
-declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>)
+; AVX2-LABEL: test6
+; AVX2: vmaskmovpd
+; AVX2: vblendvpd
+define <2 x double> @test6(<2 x i64> %trigger, i8* %addr, <2 x double> %dst) {
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64(i8* %addr, <2 x double>%dst, i32 4, <2 x i1>%mask)
+  ret <2 x double> %res
+}
+
+; AVX2-LABEL: test7
+; AVX2: vmaskmovps {{.*}}(%rdi)
+; AVX2: blend
+define <4 x float> @test7(<4 x i32> %trigger, i8* %addr, <4 x float> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.load.v4f32(i8* %addr, <4 x float>%dst, i32 4, <4 x i1>%mask)
+  ret <4 x float> %res
+}
+
+; AVX2-LABEL: test8
+; AVX2: vpmaskmovd {{.*}}(%rdi)
+; AVX2: blend
+define <4 x i32> @test8(<4 x i32> %trigger, i8* %addr, <4 x i32> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32(i8* %addr, <4 x i32>%dst, i32 4, <4 x i1>%mask)
+  ret <4 x i32> %res
+}
+
+; AVX2-LABEL: test9
+; AVX2: vpmaskmovd %xmm
+define void @test9(<4 x i32> %trigger, i8* %addr, <4 x i32> %val) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32(i8* %addr, <4 x i32>%val, i32 4, <4 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test10
+; AVX2: vmaskmovpd (%rdi), %ymm
+; AVX2: blend
+define <4 x double> @test10(<4 x i32> %trigger, i8* %addr, <4 x double> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x double> @llvm.masked.load.v4f64(i8* %addr, <4 x double>%dst, i32 4, <4 x i1>%mask)
+  ret <4 x double> %res
+}
+
+; AVX2-LABEL: test11
+; AVX2: vmaskmovps
+; AVX2: vblendvps
+define <8 x float> @test11(<8 x i32> %trigger, i8* %addr, <8 x float> %dst) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32(i8* %addr, <8 x float>%dst, i32 4, <8 x i1>%mask)
+  ret <8 x float> %res
+}
+
+; AVX2-LABEL: test12
+; AVX2: vpmaskmovd %ymm
+define void @test12(<8 x i32> %trigger, i8* %addr, <8 x i32> %val) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v8i32(i8* %addr, <8 x i32>%val, i32 4, <8 x i1>%mask)
+  ret void
+}
+
+declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32(i8*, <4 x i32>, i32, <4 x i1>)
+declare void @llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32(i8*, <8 x i32>, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32(i8*, <4 x i32>, i32, <4 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>)
+declare <8 x float> @llvm.masked.load.v8f32(i8*, <8 x float>, i32, <8 x i1>)
+declare <4 x float> @llvm.masked.load.v4f32(i8*, <4 x float>, i32, <4 x i1>)
+declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>)
+declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>)
+declare <4 x double> @llvm.masked.load.v4f64(i8*, <4 x double>, i32, <4 x i1>)
+declare <2 x double> @llvm.masked.load.v2f64(i8*, <2 x double>, i32, <2 x i1>)
+declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>)
+declare void @llvm.masked.store.v2f64(i8*, <2 x double>, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64(i8*, <2 x i64>, i32, <2 x i1>)