diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 59540211d54..15d4bde0167 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1267,7 +1267,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
-      setOperationAction(ISD::MLOAD, VT, Custom);
+      setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
       setOperationAction(ISD::MSTORE, VT, Legal);
     }
@@ -1416,10 +1416,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
     // to 512-bit rather than use the AVX2 instructions so that we can use
     // k-masks.
-    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-                    MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
-      setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
-      setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
+    if (!Subtarget.hasVLX()) {
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+        setOperationAction(ISD::MLOAD, VT, Custom);
+        setOperationAction(ISD::MSTORE, VT, Custom);
+      }
     }
 
     setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll
index e5ff6014edc..3d39f96adb5 100644
--- a/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/test/CodeGen/X86/min-legal-vector-width.ll
@@ -706,3 +706,16 @@ define void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vect
   store <64 x i8> %f, <64 x i8>* %c
   ret void
 }
+
+; This threw an assertion at one point.
+define <4 x i32> @mload_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) "min-legal-vector-width"="256" {
+; CHECK-LABEL: mload_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
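
For anyone who wants to poke at this locally, the sketch below is a hypothetical companion case, not part of the patch. It repeats the new test's pattern at 256 bits, where the masked load should now take the Legal MLOAD path directly on an AVX512VL target instead of the Custom widening path that is now reserved for targets without VLX. The function name, and an llc invocation along the lines of `llc -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq`, are my assumptions; the existing RUN line in min-legal-vector-width.ll may use a different feature set.

; Hypothetical 256-bit companion to the new mload_v4i32 test (illustrative only).
; With AVX512VL and "min-legal-vector-width"="256", this should stay a plain
; 256-bit masked load (Legal) rather than being widened to 512 bits.
define <8 x i32> @mload_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %dst) "min-legal-vector-width"="256" {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)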