[X86][InstCombine] Handle scalar fmadd intrinsics correctly in SimplifyDemandedVectorElts.

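Operand 0 is now simplified with the full DemandedElts mask, while operands 1 and 2 are simplified with only their lowest element demanded, since the upper result elements come solely from operand 0. Undef elements are now computed to match: only the lowest element's undef bit depends on operands 1 and 2, and the upper bits are taken from operand 0 alone. llvm-svn: 289632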
2024-10-18 18:42:46 +02:00 · 2016-12-14 05:43:05 +00:00 · 2016-12-14 05:43:05 +00:00 · b9bbf2793c
commit b9bbf2793c
parent 1c61c19903
2 changed files with 22 additions and 15 deletions
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@ -1754,14 +1754,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    break;
  }

-  case Intrinsic::x86_fma_vfmadd_ss:
-  case Intrinsic::x86_fma_vfmsub_ss:
-  case Intrinsic::x86_fma_vfnmadd_ss:
-  case Intrinsic::x86_fma_vfnmsub_ss:
-  case Intrinsic::x86_fma_vfmadd_sd:
-  case Intrinsic::x86_fma_vfmsub_sd:
-  case Intrinsic::x86_fma_vfnmadd_sd:
-  case Intrinsic::x86_fma_vfnmsub_sd:
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
@ -1793,6 +1785,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    break;
  }

+  case Intrinsic::x86_fma_vfmadd_ss:
+  case Intrinsic::x86_fma_vfmsub_ss:
+  case Intrinsic::x86_fma_vfnmadd_ss:
+  case Intrinsic::x86_fma_vfnmsub_ss:
+  case Intrinsic::x86_fma_vfmadd_sd:
+  case Intrinsic::x86_fma_vfmsub_sd:
+  case Intrinsic::x86_fma_vfnmadd_sd:
+  case Intrinsic::x86_fma_vfnmsub_sd:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@ -1349,6 +1349,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
      break;
    }

+    // Three input scalar-as-vector operations that work column-wise. The high
+    // elements come from operand 0 and the low element is a function of all
+    // three inputs.
    case Intrinsic::x86_fma_vfmadd_ss:
    case Intrinsic::x86_fma_vfmsub_ss:
    case Intrinsic::x86_fma_vfnmadd_ss:
@ -1360,6 +1363,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
                                        UndefElts, Depth + 1);
      if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+      // If lowest element of a scalar op isn't used then use Arg0.
+      if (!DemandedElts[0])
+        return II->getArgOperand(0);
+
+      // Only the lowest element is used for operands 1 and 2.
+      DemandedElts = 1;
      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
                                        UndefElts2, Depth + 1);
      if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
@ -1367,14 +1377,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                        UndefElts3, Depth + 1);
      if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }

-      // If lowest element of a scalar op isn't used then use Arg0.
-      if (DemandedElts.getLoBits(1) != 1)
-        return II->getArgOperand(0);
+      // Lower element is undefined if all three lower elements are undefined.
+      // Consider things like undef&0.  The result is known zero, not undef.
+      if (!UndefElts2[0] || !UndefElts3[0])
+        UndefElts.clearBit(0);

-      // Output elements are undefined if all three are undefined.  Consider
-      // things like undef&0.  The result is known zero, not undef.
-      UndefElts &= UndefElts2;
-      UndefElts &= UndefElts3;
      break;

    // SSE4A instructions leave the upper 64-bits of the 128-bit result