diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 6772d96c799..28c5d0449c8 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2478,6 +2478,9 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
 }
 
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+  // The backend can't handle a single element vector.
+  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
+    return false;
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
@@ -2501,8 +2504,13 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
   // the vector type.
   // The Scalarizer asks again about legality. It sends a vector type.
   // In this case we can reject non-power-of-2 vectors.
-  if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
-    return false;
+  // We also reject single element vectors as the type legalizer can't
+  // scalarize it.
+  if (isa<VectorType>(DataTy)) {
+    unsigned NumElts = DataTy->getVectorNumElements();
+    if (NumElts == 1 || !isPowerOf2_32(NumElts))
+      return false;
+  }
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 70472c49f57..76113f72599 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2400,3 +2400,79 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
   %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
   ret <8 x i32> %g
 }
+
+define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
+; KNL_64-LABEL: v1_scatter:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: testb $1, %dl
+; KNL_64-NEXT: jne .LBB42_1
+; KNL_64-NEXT: # BB#2: # %else
+; KNL_64-NEXT: retq
+; KNL_64-NEXT: .LBB42_1: # %cond.store
+; KNL_64-NEXT: movl %edi, (%rsi)
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: v1_scatter:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; KNL_32-NEXT: jne .LBB42_1
+; KNL_32-NEXT: # BB#2: # %else
+; KNL_32-NEXT: retl
+; KNL_32-NEXT: .LBB42_1: # %cond.store
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: movl %ecx, (%eax)
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: v1_scatter:
+; SKX: # BB#0:
+; SKX-NEXT: testb $1, %dl
+; SKX-NEXT: jne .LBB42_1
+; SKX-NEXT: # BB#2: # %else
+; SKX-NEXT: retq
+; SKX-NEXT: .LBB42_1: # %cond.store
+; SKX-NEXT: movl %edi, (%rsi)
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: v1_scatter:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; SKX_32-NEXT: jne .LBB42_1
+; SKX_32-NEXT: # BB#2: # %else
+; SKX_32-NEXT: retl
+; SKX_32-NEXT: .LBB42_1: # %cond.store
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: movl %ecx, (%eax)
+; SKX_32-NEXT: retl
+  call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
+
+define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
+; KNL_64-LABEL: v1_gather:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: movl (%rdi), %eax
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: v1_gather:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl (%eax), %eax
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: v1_gather:
+; SKX: # BB#0:
+; SKX-NEXT: movl (%rdi), %eax
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: v1_gather:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl (%eax), %eax
+; SKX_32-NEXT: retl
+  %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
+  ret <1 x i32>%res
+}
+declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index f43e3f6f56e..ef666ff1c41 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -8,6 +8,86 @@
 ; that does not have AVX, but that case should probably be a separate test file using less tests
 ; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.
 
+define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
+; AVX-LABEL: loadv1:
+; AVX: ## BB#0:
+; AVX-NEXT: testq %rdi, %rdi
+; AVX-NEXT: ## implicit-def: %XMM1
+; AVX-NEXT: je LBB0_1
+; AVX-NEXT: ## BB#2: ## %else
+; AVX-NEXT: testq %rdi, %rdi
+; AVX-NEXT: jne LBB0_3
+; AVX-NEXT: LBB0_4: ## %else
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+; AVX-NEXT: LBB0_1: ## %cond.load
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: testq %rdi, %rdi
+; AVX-NEXT: je LBB0_4
+; AVX-NEXT: LBB0_3: ## %else
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: loadv1:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: testq %rdi, %rdi
+; AVX512F-NEXT: ## implicit-def: %XMM1
+; AVX512F-NEXT: jne LBB0_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: LBB0_2: ## %else
+; AVX512F-NEXT: testq %rdi, %rdi
+; AVX512F-NEXT: sete %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: loadv1:
+; SKX: ## BB#0:
+; SKX-NEXT: testq %rdi, %rdi
+; SKX-NEXT: ## implicit-def: %XMM1
+; SKX-NEXT: jne LBB0_2
+; SKX-NEXT: ## BB#1: ## %cond.load
+; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; SKX-NEXT: LBB0_2: ## %else
+; SKX-NEXT: testq %rdi, %rdi
+; SKX-NEXT: sete %al
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; SKX-NEXT: retq
+  %mask = icmp eq <1 x i64> %trigger, zeroinitializer
+  %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
+  ret <1 x double> %res
+}
+declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
+
+define void @storev1(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
+; AVX-LABEL: storev1:
+; AVX: ## BB#0:
+; AVX-NEXT: testl %edi, %edi
+; AVX-NEXT: je LBB1_1
+; AVX-NEXT: ## BB#2: ## %else
+; AVX-NEXT: retq
+; AVX-NEXT: LBB1_1: ## %cond.store
+; AVX-NEXT: movl %edx, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: storev1:
+; AVX512: ## BB#0:
+; AVX512-NEXT: testl %edi, %edi
+; AVX512-NEXT: je LBB1_1
+; AVX512-NEXT: ## BB#2: ## %else
+; AVX512-NEXT: retq
+; AVX512-NEXT: LBB1_1: ## %cond.store
+; AVX512-NEXT: movl %edx, (%rsi)
+; AVX512-NEXT: retq
+  %mask = icmp eq <1 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
+  ret void
+}
+declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
+
 define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
 ; AVX-LABEL: test6:
 ; AVX: ## BB#0: