diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 778573390eb..5d6c4998ecd 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3101,7 +3101,8 @@ SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
 
   // If this is a splitted VSELECT that was previously already handled, do
   // nothing.
-  if (Cond->getValueType(0).getScalarSizeInBits() != 1)
+  EVT CondVT = Cond->getValueType(0);
+  if (CondVT.getScalarSizeInBits() != 1)
     return SDValue();
 
   EVT VSelVT = N->getValueType(0);
@@ -3125,6 +3126,14 @@ SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
     EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
     if (SetCCResVT.getScalarSizeInBits() == 1)
       return SDValue();
+  } else if (CondVT.getScalarType() == MVT::i1) {
+    // If there is support for an i1 vector mask (or only scalar i1 conditions),
+    // don't touch.
+    while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
+      CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
+
+    if (CondVT.getScalarType() == MVT::i1)
+      return SDValue();
   }
 
   // Get the VT and operands for VSELECT, and widen if needed.
diff --git a/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
new file mode 100644
index 00000000000..4ae219949c2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; Check that DAGTypeLegalizer::WidenVSELECTAndMask doesn't try to
+; create vselects with i64 condition masks.
+
+; FIXME: Should be able to avoid intermediate vselect
+; GCN-LABEL: {{^}}widen_vselect_and_mask_v4f64:
+; GCN: v_cmp_u_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
+; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
+; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
+; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
+define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
+bb:
+  %tmp = extractelement <4 x double> %arg, i64 0
+  %tmp1 = fcmp uno double %tmp, 0.000000e+00
+  %tmp2 = sext i1 %tmp1 to i64
+  %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+  %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
+  %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
+  %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
+  %tmp7 = fcmp une <4 x double> %arg, zeroinitializer
+  %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %tmp9 = and <4 x i1> %tmp8, %tmp7
+  %tmp10 = select <4 x i1> %tmp9, <4 x double> , <4 x double> zeroinitializer
+  store <4 x double> %tmp10, <4 x double> addrspace(1)* null, align 32
+  ret void
+}
+
+; GCN-LABEL: {{^}}widen_vselect_and_mask_v4i64:
+; GCN: v_cmp_eq_u64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
+; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
+; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
+; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
+define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 {
+bb:
+  %tmp = extractelement <4 x i64> %arg, i64 0
+  %tmp1 = icmp eq i64 %tmp, 0
+  %tmp2 = sext i1 %tmp1 to i64
+  %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+  %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
+  %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
+  %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
+  %tmp7 = icmp ne <4 x i64> %arg, zeroinitializer
+  %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %tmp9 = and <4 x i1> %tmp8, %tmp7
+  %tmp10 = select <4 x i1> %tmp9, <4 x i64> , <4 x i64> zeroinitializer
+  store <4 x i64> %tmp10, <4 x i64> addrspace(1)* null, align 32
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll
index b9f240909d8..5fc84a0aa81 100644
--- a/test/CodeGen/X86/avx512-vselect.ll
+++ b/test/CodeGen/X86/avx512-vselect.ll
@@ -23,35 +23,16 @@ entry:
 ; both formulations of vselect. All of this trickery is because we can't
 ; directly form an SDAG input to the lowering.
 define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
-; CHECK-SKX-LABEL: test2:
-; CHECK-SKX:       # BB#0: # %entry
-; CHECK-SKX-NEXT:    vxorps %xmm6, %xmm6, %xmm6
-; CHECK-SKX-NEXT:    vcmpltps %zmm0, %zmm6, %k0
-; CHECK-SKX-NEXT:    vcmpltps %zmm6, %zmm1, %k1
-; CHECK-SKX-NEXT:    korw %k1, %k0, %k0
-; CHECK-SKX-NEXT:    kshiftrw $8, %k0, %k1
-; CHECK-SKX-NEXT:    vpmovm2q %k1, %zmm1
-; CHECK-SKX-NEXT:    vpmovm2q %k0, %zmm0
-; CHECK-SKX-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; CHECK-SKX-NEXT:    vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
-; CHECK-SKX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; CHECK-SKX-NEXT:    vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-KNL-LABEL: test2:
-; CHECK-KNL:       # BB#0: # %entry
-; CHECK-KNL-NEXT:    vxorps %xmm6, %xmm6, %xmm6
-; CHECK-KNL-NEXT:    vcmpltps %zmm0, %zmm6, %k0
-; CHECK-KNL-NEXT:    vcmpltps %zmm6, %zmm1, %k1
-; CHECK-KNL-NEXT:    korw %k1, %k0, %k1
-; CHECK-KNL-NEXT:    kshiftrw $8, %k1, %k2
-; CHECK-KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; CHECK-KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; CHECK-KNL-NEXT:    vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
-; CHECK-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; CHECK-KNL-NEXT:    vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
-; CHECK-KNL-NEXT:    retq
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vxorps %xmm6, %xmm6, %xmm6
+; CHECK-NEXT:    vcmpltps %zmm0, %zmm6, %k0
+; CHECK-NEXT:    vcmpltps %zmm6, %zmm1, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k1
+; CHECK-NEXT:    vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-NEXT:    kshiftrw $8, %k1, %k1
+; CHECK-NEXT:    vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-NEXT:    retq
 entry:
   %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
   %lt.m = fcmp olt <16 x float> %y, zeroinitializer