InstCombine now optimizes vector udiv by power of 2 to shifts

Fixes r8429

llvm-svn: 144036
2024-10-20 03:23:01 +02:00 · 2011-11-07 23:04:49 +00:00 · 2011-11-07 23:04:49 +00:00 · 1d5d364e06
commit 1d5d364e06
parent c1bb1b2b09
2 changed files with 24 additions and 5 deletions
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -442,18 +442,22 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
  if (Instruction *Common = commonIDivTransforms(I))
    return Common;
  
-  if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+  { 
    // X udiv 2^C -> X >> C
    // Check to see if this is an unsigned division with an exact power of 2,
    // if so, convert to a right shift.
-    if (C->getValue().isPowerOf2()) { // 0 not included in isPowerOf2
+    const APInt *C;
+    if (match(Op1, m_Power2(C))) {
      BinaryOperator *LShr =
-        BinaryOperator::CreateLShr(Op0, 
-            ConstantInt::get(Op0->getType(), C->getValue().logBase2()));
+      BinaryOperator::CreateLShr(Op0, 
+                                 ConstantInt::get(Op0->getType(), 
+                                                  C->logBase2()));
      if (I.isExact()) LShr->setIsExact();
      return LShr;
    }
+  }

+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
    // X udiv C, where C >= signbit
    if (C->getValue().isNegative()) {
      Value *IC = Builder->CreateICmpULT(Op0, C);
--- a/test/CodeGen/X86/vec_udiv_to_shift.ll
+++ b/test/CodeGen/X86/vec_udiv_to_shift.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define <8 x i16> @udiv_vec8x16(<8 x i16> %var) {
+entry:
+; CHECK: lshr <8 x i16> %var, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+%0 = udiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+ret <8 x i16> %0
+}
+
+define <4 x i32> @udiv_vec4x32(<4 x i32> %var) {
+entry:
+; CHECK: lshr <4 x i32> %var, <i32 4, i32 4, i32 4, i32 4>
+%0 = udiv <4 x i32> %var, <i32 16, i32 16, i32 16, i32 16>
+ret <4 x i32> %0
+}