mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
Merge clang's isRepeatedBytePattern with LLVM's isBytewiseValue
Summary: This code was in CGDecl.cpp and really belongs in LLVM's isBytewiseValue. Teach isBytewiseValue the tricks clang's isRepeatedBytePattern had, including merging undef properly, and recursing on more types. clang part of this patch: D51752 Subscribers: dexonsmith, llvm-commits Differential Revision: https://reviews.llvm.org/D51751 llvm-svn: 342709
This commit is contained in:
parent
9d6986a7f5
commit
4bd103b312
@ -221,7 +221,8 @@ class Value;
|
||||
/// return the i8 value that it is represented with. This is true for all i8
|
||||
/// values obviously, but is also true for i32 0, i32 -1, i16 0xF0F0, double
|
||||
/// 0.0 etc. If the value can't be handled with a repeated byte store (e.g.
|
||||
/// i16 0x1234), return null.
|
||||
/// i16 0x1234), return null. If the value is entirely undef and padding,
|
||||
/// return undef.
|
||||
Value *isBytewiseValue(Value *V);
|
||||
|
||||
/// Given an aggregate and a sequence of indices, see if the scalar value
|
||||
|
@ -3042,62 +3042,92 @@ bool llvm::isKnownNeverNaN(const Value *V, const TargetLibraryInfo *TLI,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// If the specified value can be set by repeating the same byte in memory,
|
||||
/// return the i8 value that it is represented with. This is
|
||||
/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
|
||||
/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated
|
||||
/// byte store (e.g. i16 0x1234), return null.
|
||||
Value *llvm::isBytewiseValue(Value *V) {
|
||||
|
||||
// All byte-wide stores are splatable, even of arbitrary variables.
|
||||
if (V->getType()->isIntegerTy(8)) return V;
|
||||
if (V->getType()->isIntegerTy(8))
|
||||
return V;
|
||||
|
||||
LLVMContext &Ctx = V->getContext();
|
||||
|
||||
// Undef don't care.
|
||||
auto *UndefInt8 = UndefValue::get(Type::getInt8Ty(Ctx));
|
||||
if (isa<UndefValue>(V))
|
||||
return UndefInt8;
|
||||
|
||||
Constant *C = dyn_cast<Constant>(V);
|
||||
if (!C) {
|
||||
// Conceptually, we could handle things like:
|
||||
// %a = zext i8 %X to i16
|
||||
// %b = shl i16 %a, 8
|
||||
// %c = or i16 %a, %b
|
||||
// but until there is an example that actually needs this, it doesn't seem
|
||||
// worth worrying about.
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Handle 'null' ConstantArrayZero etc.
|
||||
if (Constant *C = dyn_cast<Constant>(V))
|
||||
if (C->isNullValue())
|
||||
return Constant::getNullValue(Type::getInt8Ty(V->getContext()));
|
||||
if (C->isNullValue())
|
||||
return Constant::getNullValue(Type::getInt8Ty(Ctx));
|
||||
|
||||
// Constant float and double values can be handled as integer values if the
|
||||
// Constant floating-point values can be handled as integer values if the
|
||||
// corresponding integer value is "byteable". An important case is 0.0.
|
||||
if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
|
||||
if (CFP->getType()->isFloatTy())
|
||||
V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext()));
|
||||
if (CFP->getType()->isDoubleTy())
|
||||
V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext()));
|
||||
if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
|
||||
Type *Ty = nullptr;
|
||||
if (CFP->getType()->isHalfTy())
|
||||
Ty = Type::getInt16Ty(Ctx);
|
||||
else if (CFP->getType()->isFloatTy())
|
||||
Ty = Type::getInt32Ty(Ctx);
|
||||
else if (CFP->getType()->isDoubleTy())
|
||||
Ty = Type::getInt64Ty(Ctx);
|
||||
// Don't handle long double formats, which have strange constraints.
|
||||
return Ty ? isBytewiseValue(ConstantExpr::getBitCast(CFP, Ty)) : nullptr;
|
||||
}
|
||||
|
||||
// We can handle constant integers that are multiple of 8 bits.
|
||||
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
|
||||
if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
|
||||
if (CI->getBitWidth() % 8 == 0) {
|
||||
assert(CI->getBitWidth() > 8 && "8 bits should be handled above!");
|
||||
|
||||
if (!CI->getValue().isSplat(8))
|
||||
return nullptr;
|
||||
return ConstantInt::get(V->getContext(), CI->getValue().trunc(8));
|
||||
return ConstantInt::get(Ctx, CI->getValue().trunc(8));
|
||||
}
|
||||
}
|
||||
|
||||
// A ConstantDataArray/Vector is splatable if all its members are equal and
|
||||
// also splatable.
|
||||
if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(V)) {
|
||||
Value *Elt = CA->getElementAsConstant(0);
|
||||
Value *Val = isBytewiseValue(Elt);
|
||||
if (!Val)
|
||||
auto Merge = [&](Value *LHS, Value *RHS) -> Value * {
|
||||
if (LHS == RHS)
|
||||
return LHS;
|
||||
if (!LHS || !RHS)
|
||||
return nullptr;
|
||||
if (LHS == UndefInt8)
|
||||
return RHS;
|
||||
if (RHS == UndefInt8)
|
||||
return LHS;
|
||||
return nullptr;
|
||||
};
|
||||
|
||||
for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I)
|
||||
if (CA->getElementAsConstant(I) != Elt)
|
||||
if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(C)) {
|
||||
Value *Val = UndefInt8;
|
||||
for (unsigned I = 0, E = CA->getNumElements(); I != E; ++I)
|
||||
if (!(Val = Merge(Val, isBytewiseValue(CA->getElementAsConstant(I)))))
|
||||
return nullptr;
|
||||
|
||||
return Val;
|
||||
}
|
||||
|
||||
// Conceptually, we could handle things like:
|
||||
// %a = zext i8 %X to i16
|
||||
// %b = shl i16 %a, 8
|
||||
// %c = or i16 %a, %b
|
||||
// but until there is an example that actually needs this, it doesn't seem
|
||||
// worth worrying about.
|
||||
if (isa<ConstantVector>(C)) {
|
||||
Constant *Splat = cast<ConstantVector>(C)->getSplatValue();
|
||||
return Splat ? isBytewiseValue(Splat) : nullptr;
|
||||
}
|
||||
|
||||
if (isa<ConstantArray>(C) || isa<ConstantStruct>(C)) {
|
||||
Value *Val = UndefInt8;
|
||||
for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I)
|
||||
if (!(Val = Merge(Val, isBytewiseValue(C->getOperand(I)))))
|
||||
return nullptr;
|
||||
return Val;
|
||||
}
|
||||
|
||||
// Don't try to handle the handful of other constants.
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
@ -348,6 +348,9 @@ static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
|
||||
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
|
||||
/// just replicate their input array and then pass on to memset_pattern16.
|
||||
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
|
||||
// FIXME: This could check for UndefValue because it can be merged into any
|
||||
// other valid pattern.
|
||||
|
||||
// If the value isn't a constant, we can't promote it to being in a constant
|
||||
// array. We could theoretically do a store to an alloca or something, but
|
||||
// that doesn't seem worthwhile.
|
||||
@ -645,9 +648,13 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
|
||||
|
||||
if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
|
||||
if (For == ForMemset::Yes) {
|
||||
if (isa<UndefValue>(FirstSplatValue))
|
||||
FirstSplatValue = SecondSplatValue;
|
||||
if (FirstSplatValue != SecondSplatValue)
|
||||
continue;
|
||||
} else {
|
||||
if (isa<UndefValue>(FirstPatternValue))
|
||||
FirstPatternValue = SecondPatternValue;
|
||||
if (FirstPatternValue != SecondPatternValue)
|
||||
continue;
|
||||
}
|
||||
|
@ -413,7 +413,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
|
||||
if (!NextStore->isSimple()) break;
|
||||
|
||||
// Check to see if this stored value is of the same byte-splattable value.
|
||||
if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
|
||||
Value *StoredByte = isBytewiseValue(NextStore->getOperand(0));
|
||||
if (isa<UndefValue>(ByteVal) && StoredByte)
|
||||
ByteVal = StoredByte;
|
||||
if (ByteVal != StoredByte)
|
||||
break;
|
||||
|
||||
// Check to see if this store is to a constant offset from the start ptr.
|
||||
|
@ -73,13 +73,16 @@ define void @copyalias(%S* %src, %S* %dst) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; If the store address is computed ina complex manner, make
|
||||
; If the store address is computed in a complex manner, make
|
||||
; sure we lift the computation as well if needed and possible.
|
||||
define void @addrproducer(%S* %src, %S* %dst) {
|
||||
; CHECK-LABEL: addrproducer
|
||||
; CHECK: %dst2 = getelementptr %S, %S* %dst, i64 1
|
||||
; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
|
||||
; CHECK-NEXT: store %S undef, %S* %dst
|
||||
; CHECK-LABEL: addrproducer(
|
||||
; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8*
|
||||
; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i64 1
|
||||
; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8*
|
||||
; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8*
|
||||
; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST]], i64 16, i1 false)
|
||||
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false)
|
||||
; CHECK-NEXT: ret void
|
||||
%1 = load %S, %S* %src
|
||||
store %S undef, %S* %dst
|
||||
@ -89,7 +92,14 @@ define void @addrproducer(%S* %src, %S* %dst) {
|
||||
}
|
||||
|
||||
define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) {
|
||||
; CHECK-LABEL: aliasaddrproducer
|
||||
; CHECK-LABEL: aliasaddrproducer(
|
||||
; CHECK-NEXT: %[[SRC:[0-9]+]] = load %S, %S* %src
|
||||
; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8*
|
||||
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false)
|
||||
; CHECK-NEXT: %dstindex = load i32, i32* %dstidptr
|
||||
; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex
|
||||
; CHECK-NEXT: store %S %[[SRC]], %S* %dst2
|
||||
; CHECK-NEXT: ret void
|
||||
%1 = load %S, %S* %src
|
||||
store %S undef, %S* %dst
|
||||
%dstindex = load i32, i32* %dstidptr
|
||||
@ -99,7 +109,16 @@ define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) {
|
||||
}
|
||||
|
||||
define void @noaliasaddrproducer(%S* %src, %S* noalias %dst, i32* noalias %dstidptr) {
|
||||
; CHECK-LABEL: noaliasaddrproducer
|
||||
; CHECK-LABEL: noaliasaddrproducer(
|
||||
; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8*
|
||||
; CHECK-NEXT: %[[LOADED:[0-9]+]] = load i32, i32* %dstidptr
|
||||
; CHECK-NEXT: %dstindex = or i32 %[[LOADED]], 1
|
||||
; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex
|
||||
; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8*
|
||||
; CHECK-NEXT: %[[SRCCAST2:[0-9]+]] = bitcast %S* %src to i8*
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST2]], i64 16, i1 false)
|
||||
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SRCCAST]], i8 undef, i64 16, i1 false)
|
||||
; CHECK-NEXT: ret void
|
||||
%1 = load %S, %S* %src
|
||||
store %S undef, %S* %src
|
||||
%2 = load i32, i32* %dstidptr
|
||||
|
@ -1,19 +1,89 @@
|
||||
; RUN: opt -memcpyopt -S < %s | FileCheck %s
|
||||
|
||||
@cst = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4
|
||||
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
|
||||
declare void @foo(i32*) nounwind
|
||||
|
||||
define void @test1() nounwind {
|
||||
%arr = alloca [3 x i32], align 4
|
||||
%arr_i8 = bitcast [3 x i32]* %arr to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %arr_i8, i8* align 4 bitcast ([3 x i32]* @cst to i8*), i64 12, i1 false)
|
||||
%arraydecay = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i64 0, i64 0
|
||||
call void @foo(i32* %arraydecay) nounwind
|
||||
@undef = internal constant i32 undef, align 4
|
||||
define void @test_undef() nounwind {
|
||||
%a = alloca i32, align 4
|
||||
%i8 = bitcast i32* %a to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (i32* @undef to i8*), i64 4, i1 false)
|
||||
ret void
|
||||
; CHECK-LABEL: @test1(
|
||||
; CHECK: call void @llvm.memset
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
; CHECK-LABEL: @test_undef(
|
||||
; CHECK: call void @llvm.memset
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
@i32x3 = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4
|
||||
define void @test_i32x3() nounwind {
|
||||
%a = alloca [3 x i32], align 4
|
||||
%i8 = bitcast [3 x i32]* %a to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3 to i8*), i64 12, i1 false)
|
||||
ret void
|
||||
; CHECK-LABEL: @test_i32x3(
|
||||
; CHECK: call void @llvm.memset
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
@i32x3_undef = internal constant [3 x i32] [i32 -1, i32 undef, i32 -1], align 4
|
||||
define void @test_i32x3_undef() nounwind {
|
||||
%a = alloca [3 x i32], align 4
|
||||
%i8 = bitcast [3 x i32]* %a to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3_undef to i8*), i64 12, i1 false)
|
||||
ret void
|
||||
; CHECK-LABEL: @test_i32x3_undef(
|
||||
; CHECK: call void @llvm.memset
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
%struct.bitfield = type { i8, [3 x i8] }
|
||||
@bitfield = private unnamed_addr constant %struct.bitfield { i8 -86, [3 x i8] [i8 -86, i8 -86, i8 -86] }, align 4
|
||||
define void @test_bitfield() nounwind {
|
||||
%a = alloca %struct.bitfield, align 4
|
||||
%i8 = bitcast %struct.bitfield* %a to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (%struct.bitfield* @bitfield to i8*), i64 4, i1 false)
|
||||
ret void
|
||||
; CHECK-LABEL: @test_bitfield(
|
||||
; CHECK: call void @llvm.memset
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
@i1x16_zero = internal constant <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, align 4
|
||||
define void @test_i1x16_zero() nounwind {
|
||||
%a = alloca <16 x i1>, align 4
|
||||
%i8 = bitcast <16 x i1>* %a to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_zero to i8*), i64 16, i1 false)
|
||||
ret void
|
||||
; CHECK-LABEL: @test_i1x16_zero(
|
||||
; CHECK: call void @llvm.memset
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
; i1 isn't currently handled. Should it?
|
||||
@i1x16_one = internal constant <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, align 4
|
||||
define void @test_i1x16_one() nounwind {
|
||||
%a = alloca <16 x i1>, align 4
|
||||
%i8 = bitcast <16 x i1>* %a to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_one to i8*), i64 16, i1 false)
|
||||
ret void
|
||||
; CHECK-LABEL: @test_i1x16_one(
|
||||
; CHECK-NOT: call void @llvm.memset
|
||||
; CHECK: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
@half = internal constant half 0xH0000, align 4
|
||||
define void @test_half() nounwind {
|
||||
%a = alloca half, align 4
|
||||
%i8 = bitcast half* %a to i8*
|
||||
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (half* @half to i8*), i64 2, i1 false)
|
||||
ret void
|
||||
; CHECK-LABEL: @test_half(
|
||||
; CHECK: call void @llvm.memset
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user