diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 31eb6aed31c..adc433c4b70 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -768,6 +768,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // TODO: Could compute known zero/one bits based on the input.
       break;
     }
+    case Intrinsic::x86_mmx_pmovmskb:
     case Intrinsic::x86_sse_movmsk_ps:
     case Intrinsic::x86_sse2_movmsk_pd:
     case Intrinsic::x86_sse2_pmovmskb_128:
@@ -776,9 +777,14 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     case Intrinsic::x86_avx2_pmovmskb: {
       // MOVMSK copies the vector elements' sign bits to the low bits
       // and zeros the high bits.
-      auto Arg = II->getArgOperand(0);
-      auto ArgType = cast<VectorType>(Arg->getType());
-      unsigned ArgWidth = ArgType->getNumElements();
+      unsigned ArgWidth;
+      if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
+        ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
+      } else {
+        auto Arg = II->getArgOperand(0);
+        auto ArgType = cast<VectorType>(Arg->getType());
+        ArgWidth = ArgType->getNumElements();
+      }
 
       // If we don't need any of low bits then return zero,
       // we know that DemandedMask is non-zero already.
diff --git a/test/Transforms/InstCombine/x86-movmsk.ll b/test/Transforms/InstCombine/x86-movmsk.ll
index 767899432b0..3b644cba8a2 100644
--- a/test/Transforms/InstCombine/x86-movmsk.ll
+++ b/test/Transforms/InstCombine/x86-movmsk.ll
@@ -7,6 +7,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; DemandedBits - MOVMSK zeros the upper bits of the result.
 ;
 
+define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
+; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %2 = and i32 %1, 255
+  ret i32 %2
+}
+
 define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
 ; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
@@ -63,6 +73,15 @@ define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; DemandedBits - If we don't use the lower bits then we just return zero.
 ;
 
+define i32 @test_lower_x86_mmx_pmovmskb(x86_mmx %a0) {
+; CHECK-LABEL: @test_lower_x86_mmx_pmovmskb(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %2 = and i32 %1, -256
+  ret i32 %2
+}
+
 define i32 @test_lower_x86_sse_movmsk_ps(<4 x float> %a0) {
 ; CHECK-LABEL: @test_lower_x86_sse_movmsk_ps(
 ; CHECK-NEXT:    ret i32 0
@@ -110,6 +129,7 @@ define i32 @test_lower_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 
 ; llvm.x86.avx2.pmovmskb uses the whole of the 32-bit register.
 
+declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)