diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 554de000b59..5df78fe3e5c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -31361,50 +31361,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
-// If this is a bitcasted op that can be represented as another type, push the
-// the bitcast to the inputs. This allows more opportunities for pattern
-// matching masked instructions. This is called when we know that the operation
-// is used as one of the inputs of a vselect.
-static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
-                                      TargetLowering::DAGCombinerInfo &DCI) {
-  // Make sure we have a bitcast.
-  if (OrigOp.getOpcode() != ISD::BITCAST)
-    return false;
-
-  SDValue Op = OrigOp.getOperand(0);
-
-  // If the operation is used by anything other than the bitcast, we shouldn't
-  // do this combine as that would replicate the operation.
-  if (!Op.hasOneUse())
-    return false;
-
-  MVT VT = OrigOp.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-  SDLoc DL(Op.getNode());
-
-  unsigned Opcode = Op.getOpcode();
-  switch (Opcode) {
-  case X86ISD::SUBV_BROADCAST: {
-    unsigned EltSize = EltVT.getSizeInBits();
-    if (EltSize != 32 && EltSize != 64)
-      return false;
-    // Only change element size, not type.
-    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
-      return false;
-    SDValue Op0 = Op.getOperand(0);
-    MVT Op0VT = MVT::getVectorVT(EltVT,
-                           Op0.getSimpleValueType().getSizeInBits() / EltSize);
-    Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
-    DCI.AddToWorklist(Op0.getNode());
-    DCI.CombineTo(OrigOp.getNode(),
-                  DAG.getNode(Opcode, DL, VT, Op0));
-    return true;
-  }
-  }
-
-  return false;
-}
-
 /// Do target-specific dag combines on SELECT and VSELECT nodes.
 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
@@ -31770,17 +31726,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Look for vselects with LHS/RHS being bitcasted from an operation that
-  // can be executed on another type. Push the bitcast to the inputs of
-  // the operation. This exposes opportunities for using masking instructions.
-  if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
-      CondVT.getVectorElementType() == MVT::i1) {
-    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
-      return SDValue(N, 0);
-    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
-      return SDValue(N, 0);
-  }
-
   // Custom action for SELECT MMX
   if (VT == MVT::x86mmx) {
     LHS = DAG.getBitcast(MVT::i64, LHS);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index c6a404f4dfb..855487e31d2 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1487,6 +1487,41 @@
 def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
 def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
+
+// Patterns for selects of bitcasted operations.
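+// With combineBitcastForMaskedOp removed above, a masked subvector broadcast
+// reaches isel as a vselect whose true operand is a bitcast of
+// X86SubVBroadcast. These patterns match that shape directly: an all-zeros
+// false operand selects the zero-masking (rmkz) form, and a register false
+// operand selects the merge-masking (rmk) form.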
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+                   (bc_v16f32 (v16i32 immAllZerosV))),
+          (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+                   VR512:$src0),
+          (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+                   (v16i32 immAllZerosV)),
+          (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+                   VR512:$src0),
+          (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+                   (bc_v8f64 (v16i32 immAllZerosV))),
+          (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+                   VR512:$src0),
+          (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 immAllZerosV))),
+          (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   VR512:$src0),
+          (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
 
 let Predicates = [HasVLX] in {
@@ -1506,6 +1541,25 @@
 def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+                   (bc_v8f32 (v8i32 immAllZerosV))),
+          (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+                   VR256X:$src0),
+          (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+                   (v8i32 immAllZerosV)),
+          (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+                   VR256X:$src0),
+          (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+
+
 // Provide fallback in case the load node that is used in the patterns above
 // is used by additional users, which prevents the pattern selection.
 def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
@@ -1535,6 +1589,24 @@ defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2"
                                            v4i64x_info, v2i64x_info>, VEX_W,
                                            EVEX_V256, EVEX_CD8<64, CD8VT2>;
 defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
                                            v4f64x_info, v2f64x_info>, VEX_W,
                                            EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+// Patterns for selects of bitcasted operations.
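+// Same approach for the DQI 128-bit-to-256-bit broadcasts. The v4f64
+// bitcasts here rely on the bc_v4f64 fragment added to
+// X86InstrFragmentsSIMD.td in this patch.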
+def : Pat<(vselect VK4WM:$mask,
+                   (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+                   (bc_v4f64 (v8i32 immAllZerosV))),
+          (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+                   (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+                   VR256X:$src0),
+          (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v4i64 (v8i32 immAllZerosV))),
+          (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect VK4WM:$mask,
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   VR256X:$src0),
+          (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 }
 
 let Predicates = [HasDQI] in {
@@ -1550,6 +1622,41 @@ defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
 defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
                                            v16f32_info, v8f32x_info>, EVEX_V512,
                                            EVEX_CD8<32, CD8VT8>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+                   (bc_v16f32 (v16i32 immAllZerosV))),
+          (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+                   VR512:$src0),
+          (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+                   (v16i32 immAllZerosV)),
+          (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect VK16WM:$mask,
+                   (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+                   VR512:$src0),
+          (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+                   (bc_v8f64 (v16i32 immAllZerosV))),
+          (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+                   VR512:$src0),
+          (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 immAllZerosV))),
+          (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect VK8WM:$mask,
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   VR512:$src0),
+          (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
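+
+// The paired *_mask and *_maskz tests added to vector-shuffle-masked.ll below
+// cover the merge- and zero-masking forms selected by each of these patterns.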
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index beb94552dbb..d54b44e1e8d 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -878,6 +878,7 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
 def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
 def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
 def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>;
 
 // 512-bit bitconvert pattern fragments
 def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll
index ee8ab50b588..06e948110f3 100644
--- a/test/CodeGen/X86/vector-shuffle-masked.ll
+++ b/test/CodeGen/X86/vector-shuffle-masked.ll
@@ -1511,8 +1511,22 @@ define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask)
   ret <2 x double> %res
 }
 
-define <8 x float> @test_broadcast_2f64_8f32(<2 x double> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_2f64_8f32:
+define <8 x float> @test_broadcast_2f64_8f32_mask(<2 x double> *%p, i8 %mask, <8 x float> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_2f64_8f32_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    retq
+  %1 = load <2 x double>, <2 x double> *%p
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %3 = bitcast <4 x double> %2 to <8 x float>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %mask.cast, <8 x float> %3, <8 x float> %passthru
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_broadcast_2f64_8f32_maskz(<2 x double> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_2f64_8f32_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
@@ -1525,8 +1539,22 @@ define <8 x float> @test_broadcast_2f64_8f32(<2 x double> *%p, i8 %mask) nounwin
   ret <8 x float> %res
 }
 
-define <8 x i32> @test_broadcast_2i64_8i32(<2 x i64> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_2i64_8i32:
+define <8 x i32> @test_broadcast_2i64_8i32_mask(<2 x i64> *%p, i8 %mask, <8 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_2i64_8i32_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    retq
+  %1 = load <2 x i64>, <2 x i64> *%p
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %3 = bitcast <4 x i64> %2 to <8 x i32>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %mask.cast, <8 x i32> %3, <8 x i32> %passthru
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_broadcast_2i64_8i32_maskz(<2 x i64> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_2i64_8i32_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
@@ -1539,8 +1567,22 @@ define <8 x i32> @test_broadcast_2i64_8i32(<2 x i64> *%p, i8 %mask) nounwind {
   ret <8 x i32> %res
 }
 
-define <16 x float> @test_broadcast_2f64_16f32(<2 x double> *%p, i16 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_2f64_16f32:
+define <16 x float> @test_broadcast_2f64_16f32_mask(<2 x double> *%p, i16 %mask, <16 x float> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_2f64_16f32_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    retq
+  %1 = load <2 x double>, <2 x double> *%p
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  %3 = bitcast <8 x double> %2 to <16 x float>
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %mask.cast, <16 x float> %3, <16 x float> %passthru
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_broadcast_2f64_16f32_maskz(<2 x double> *%p, i16 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_2f64_16f32_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
@@ -1553,8 +1595,22 @@ define <16 x float> @test_broadcast_2f64_16f32(<2 x double> *%p, i16 %mask) noun
   ret <16 x float> %res
 }
 
-define <16 x i32> @test_broadcast_2i64_16i32(<2 x i64> *%p, i16 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_2i64_16i32:
+define <16 x i32> @test_broadcast_2i64_16i32_mask(<2 x i64> *%p, i16 %mask, <16 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_2i64_16i32_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    retq
+  %1 = load <2 x i64>, <2 x i64> *%p
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  %3 = bitcast <8 x i64> %2 to <16 x i32>
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %mask.cast, <16 x i32> %3, <16 x i32> %passthru
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_broadcast_2i64_16i32_maskz(<2 x i64> *%p, i16 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_2i64_16i32_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
@@ -1567,8 +1623,22 @@ define <16 x i32> @test_broadcast_2i64_16i32(<2 x i64> *%p, i16 %mask) nounwind
   ret <16 x i32> %res
 }
 
-define <16 x float> @test_broadcast_4f64_16f32(<4 x double> *%p, i16 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_4f64_16f32:
+define <16 x float> @test_broadcast_4f64_16f32_mask(<4 x double> *%p, i16 %mask, <16 x float> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_4f64_16f32_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    retq
+  %1 = load <4 x double>, <4 x double> *%p
+  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <8 x double> %2 to <16 x float>
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %mask.cast, <16 x float> %3, <16 x float> %passthru
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_broadcast_4f64_16f32_maskz(<4 x double> *%p, i16 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_4f64_16f32_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
@@ -1581,8 +1651,22 @@ define <16 x float> @test_broadcast_4f64_16f32(<4 x double> *%p, i16 %mask) noun
   ret <16 x float> %res
 }
 
-define <16 x i32> @test_broadcast_4i64_16i32(<4 x i64> *%p, i16 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_4i64_16i32:
+define <16 x i32> @test_broadcast_4i64_16i32_mask(<4 x i64> *%p, i16 %mask, <16 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_4i64_16i32_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    retq
+  %1 = load <4 x i64>, <4 x i64> *%p
+  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <8 x i64> %2 to <16 x i32>
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %mask.cast, <16 x i32> %3, <16 x i32> %passthru
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_broadcast_4i64_16i32_maskz(<4 x i64> *%p, i16 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_4i64_16i32_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
@@ -1595,8 +1679,23 @@ define <16 x i32> @test_broadcast_4i64_16i32(<4 x i64> *%p, i16 %mask) nounwind
   ret <16 x i32> %res
 }
 
-define <4 x double> @test_broadcast_4f32_4f64(<4 x float> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_4f32_4f64:
+define <4 x double> @test_broadcast_4f32_4f64_mask(<4 x float> *%p, i8 %mask, <4 x double> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_4f32_4f64_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT:    retq
+  %1 = load <4 x float>, <4 x float> *%p
+  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <8 x float> %2 to <4 x double>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res = select <4 x i1> %mask.extract, <4 x double> %3, <4 x double> %passthru
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_broadcast_4f32_4f64_maskz(<4 x float> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_4f32_4f64_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
@@ -1610,8 +1709,23 @@ define <4 x double> @test_broadcast_4f32_4f64(<4 x float> *%p, i8 %mask) nounwin
   ret <4 x double> %res
 }
 
-define <4 x i64> @test_broadcast_4i32_4i64(<4 x i32> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_4i32_4i64:
+define <4 x i64> @test_broadcast_4i32_4i64_mask(<4 x i32> *%p, i8 %mask, <4 x i64> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_4i32_4i64_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT:    retq
+  %1 = load <4 x i32>, <4 x i32> *%p
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <8 x i32> %2 to <4 x i64>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res = select <4 x i1> %mask.extract, <4 x i64> %3, <4 x i64> %passthru
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_broadcast_4i32_4i64_maskz(<4 x i32> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_4i32_4i64_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
@@ -1625,8 +1739,22 @@ define <4 x i64> @test_broadcast_4i32_4i64(<4 x i32> *%p, i8 %mask) nounwind {
   ret <4 x i64> %res
 }
 
-define <8 x double> @test_broadcast_4f32_8f64(<4 x float> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_4f32_8f64:
+define <8 x double> @test_broadcast_4f32_8f64_mask(<4 x float> *%p, i8 %mask, <8 x double> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_4f32_8f64_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT:    retq
+  %1 = load <4 x float>, <4 x float> *%p
+  %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <16 x float> %2 to <8 x double>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %mask.cast, <8 x double> %3, <8 x double> %passthru
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_broadcast_4f32_8f64_maskz(<4 x float> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_4f32_8f64_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
@@ -1639,8 +1767,22 @@ define <8 x double> @test_broadcast_4f32_8f64(<4 x float> *%p, i8 %mask) nounwin
   ret <8 x double> %res
 }
 
-define <8 x i64> @test_broadcast_4i32_8i64(<4 x i32> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_4i32_8i64:
+define <8 x i64> @test_broadcast_4i32_8i64_mask(<4 x i32> *%p, i8 %mask, <8 x i64> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_4i32_8i64_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT:    retq
+  %1 = load <4 x i32>, <4 x i32> *%p
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %mask.cast, <8 x i64> %3, <8 x i64> %passthru
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_broadcast_4i32_8i64_maskz(<4 x i32> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_4i32_8i64_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
@@ -1653,8 +1795,22 @@ define <8 x i64> @test_broadcast_4i32_8i64(<4 x i32> *%p, i8 %mask) nounwind {
   ret <8 x i64> %res
 }
 
-define <8 x double> @test_broadcast_8f32_8f64(<8 x float> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_8f32_8f64:
+define <8 x double> @test_broadcast_8f32_8f64_mask(<8 x float> *%p, i8 %mask, <8 x double> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_8f32_8f64_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    retq
+  %1 = load <8 x float>, <8 x float> *%p
+  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %3 = bitcast <16 x float> %2 to <8 x double>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %mask.cast, <8 x double> %3, <8 x double> %passthru
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_broadcast_8f32_8f64_maskz(<8 x float> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_8f32_8f64_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
@@ -1667,8 +1823,22 @@ define <8 x double> @test_broadcast_8f32_8f64(<8 x float> *%p, i8 %mask) nounwin
   ret <8 x double> %res
 }
 
-define <8 x i64> @test_broadcast_8i32_8i64(<8 x i32> *%p, i8 %mask) nounwind {
-; CHECK-LABEL: test_broadcast_8i32_8i64:
+define <8 x i64> @test_broadcast_8i32_8i64_mask(<8 x i32> *%p, i8 %mask, <8 x i64> %passthru) nounwind {
+; CHECK-LABEL: test_broadcast_8i32_8i64_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    retq
+  %1 = load <8 x i32>, <8 x i32> *%p
+  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %mask.cast, <8 x i64> %3, <8 x i64> %passthru
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_broadcast_8i32_8i64_maskz(<8 x i32> *%p, i8 %mask) nounwind {
+; CHECK-LABEL: test_broadcast_8i32_8i64_maskz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]