1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-18 18:42:46 +02:00

[X86][InstCombine] Handle scalar fmadd intrinsics correctly in SimplifyDemandedVectorElts.

Now we pass a modified version of DemandedElts to each operand and we calculate undef elts correctly.

llvm-svn: 289632
This commit is contained in:
Craig Topper 2016-12-14 05:43:05 +00:00
parent 1c61c19903
commit b9bbf2793c
2 changed files with 22 additions and 15 deletions

View File

@ -1754,14 +1754,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
case Intrinsic::x86_fma_vfmadd_ss:
case Intrinsic::x86_fma_vfmsub_ss:
case Intrinsic::x86_fma_vfnmadd_ss:
case Intrinsic::x86_fma_vfnmsub_ss:
case Intrinsic::x86_fma_vfmadd_sd:
case Intrinsic::x86_fma_vfmsub_sd:
case Intrinsic::x86_fma_vfnmadd_sd:
case Intrinsic::x86_fma_vfnmsub_sd:
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_mul_ss_round:
@ -1793,6 +1785,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
case Intrinsic::x86_fma_vfmadd_ss:
case Intrinsic::x86_fma_vfmsub_ss:
case Intrinsic::x86_fma_vfnmadd_ss:
case Intrinsic::x86_fma_vfnmsub_ss:
case Intrinsic::x86_fma_vfmadd_sd:
case Intrinsic::x86_fma_vfmsub_sd:
case Intrinsic::x86_fma_vfnmadd_sd:
case Intrinsic::x86_fma_vfnmsub_sd:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:

View File

@ -1349,6 +1349,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
// Three input scalar-as-vector operations that work column-wise. The high
// elements come from operand 0 and the low element is a function of all
// three inputs.
case Intrinsic::x86_fma_vfmadd_ss:
case Intrinsic::x86_fma_vfmsub_ss:
case Intrinsic::x86_fma_vfnmadd_ss:
@ -1360,6 +1363,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0])
return II->getArgOperand(0);
// Only lower element is used for operand 1 and 2.
DemandedElts = 1;
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
UndefElts2, Depth + 1);
if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
@ -1367,14 +1377,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
UndefElts3, Depth + 1);
if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
// If lowest element of a scalar op isn't used then use Arg0.
if (DemandedElts.getLoBits(1) != 1)
return II->getArgOperand(0);
// Lower element is undefined if all three lower elements are undefined.
// Consider things like undef&0. The result is known zero, not undef.
if (!UndefElts2[0] || !UndefElts3[0])
UndefElts.clearBit(0);
// Output elements are undefined if all three are undefined. Consider
// things like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2;
UndefElts &= UndefElts3;
break;
// SSE4A instructions leave the upper 64-bits of the 128-bit result