Fixed the DAG combiner to better handle the folding of AND nodes for vector types. The previous code was making the assumption that the length of the bitmask returned by isConstantSplat was equal to the size of the vector type. Now we first make sure that the splat value has at least the length of the vector lane type, then we only use as many fields as we have available in the splat value.

llvm-svn: 163203
2025-02-01 05:01:59 +01:00 · 2012-09-05 08:57:21 +00:00 · 2012-09-05 08:57:21 +00:00 · 6f46bb1705
commit 6f46bb1705
parent a56cf96db5
2 changed files with 22 additions and 1 deletions
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -2496,8 +2496,18 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
        // lanes of the constant together.
        EVT VT = Vector->getValueType(0);
        unsigned BitWidth = VT.getVectorElementType().getSizeInBits();
+
+        // If the splat value has been compressed to a bitlength lower
+        // than the size of the vector lane, we need to re-expand it to
+        // the lane size.
+        if (BitWidth > SplatBitSize)
+          for (SplatValue = SplatValue.zextOrTrunc(BitWidth);
+               SplatBitSize < BitWidth;
+               SplatBitSize = SplatBitSize * 2)
+            SplatValue |= SplatValue.shl(SplatBitSize);
+
        Constant = APInt::getAllOnesValue(BitWidth);
-        for (unsigned i = 0, n = VT.getVectorNumElements(); i < n; ++i)
+        for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
          Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
      }
    }
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@ -62,3 +62,14 @@ define <4 x i8> @i(<4 x i8>* %x) {
  %2 = sdiv <4 x i8> zeroinitializer, %1
  ret <4 x i8> %2
 }
+; CHECK: j:
+define <4 x i32> @j(<4 x i8>* %in) nounwind {
+  ; CHECK: vld1
+  ; CHECK: vmovl.u8
+  ; CHECK: vmovl.u16
+  ; CHECK-NOT: vand
+  %1 = load <4 x i8>* %in, align 4
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+