
Revert r349731 "[CodeGen][ExpandMemcmp] Add an option for allowing overlapping loads."

Forgot to update PowerPC tests for the GEP->bitcast change.

llvm-svn: 349733
Clement Courbet 2018-12-20 09:58:33 +00:00
parent af5c2af0bc
commit 64abae54ed
6 changed files with 266 additions and 640 deletions


@ -581,17 +581,13 @@ public:
struct MemCmpExpansionOptions {
// The list of available load sizes (in bytes), sorted in decreasing order.
SmallVector<unsigned, 8> LoadSizes;
// Set to true to allow overlapping loads. For example, 7-byte compares can
// be done with two 4-byte compares instead of 4+2+1-byte compares. This
// requires all loads in LoadSizes to be doable in an unaligned way.
bool AllowOverlappingLoads = false;
};
const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const;
/// Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;
/// Enable matching of interleaved access groups that contain predicated
/// Enable matching of interleaved access groups that contain predicated
/// accesses or gaps and therefore vectorized using masked
/// vector loads/stores.
bool enableMaskedInterleavedAccessVectorization() const;
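Note on the reverted option: AllowOverlappingLoads in the MemCmpExpansionOptions struct above was meant to be set only by targets whose listed LoadSizes can all be loaded unaligned. A minimal, hypothetical backend override might have looked like the sketch below; the real X86 version appears (and is removed again) in the X86TargetTransformInfo hunk further down.

// Hypothetical target override, for illustration only.
const TargetTransformInfo::MemCmpExpansionOptions *
MyTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
  static const auto Options = [] {
    TargetTransformInfo::MemCmpExpansionOptions O;
    O.LoadSizes.push_back(4);
    O.LoadSizes.push_back(2);
    O.LoadSizes.push_back(1);
    // Valid only because every size listed above supports unaligned loads.
    O.AllowOverlappingLoads = true;
    return O;
  }();
  return &Options;
}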
@ -776,7 +772,7 @@ public:
/// \return The cost of a shuffle instruction of kind Kind and of type Tp.
/// The index and subtype parameters are used by the subvector insertion and
/// extraction shuffle kinds to show the insert/extract point and the type of
/// the subvector being inserted/extracted.
/// the subvector being inserted/extracted.
/// NOTE: For subvector extractions Tp represents the source type.
int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
Type *SubTp = nullptr) const;


@ -66,18 +66,23 @@ class MemCmpExpansion {
// Represents the decomposition in blocks of the expansion. For example,
// comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
// 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
// TODO(courbet): Involve the target more in this computation. On X86, 7
// bytes can be done more efficiently with two overlapping 4-byte loads than
// covering the interval with [{4, 0}, {2, 4}, {1, 6}].
struct LoadEntry {
LoadEntry(unsigned LoadSize, uint64_t Offset)
: LoadSize(LoadSize), Offset(Offset) {
assert(Offset % LoadSize == 0 && "invalid load entry");
}
uint64_t getGEPIndex() const { return Offset / LoadSize; }
// The size of the load for this block, in bytes.
unsigned LoadSize;
// The offset of this load from the base pointer, in bytes.
uint64_t Offset;
const unsigned LoadSize;
// The offset of this load WRT the base pointer, in bytes.
const uint64_t Offset;
};
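As a concrete reading of the 33-byte example in the comment above, the decomposition consists of the following {LoadSize, Offset} entries (a sketch using a local struct, not the pass's own LoadEntry):

#include <cstdint>
struct Entry { unsigned LoadSize; uint64_t Offset; };
// Two 16-byte loads at offsets 0 and 16, then one 1-byte load at offset 32.
static const Entry Bytes33[] = {{16, 0}, {16, 16}, {1, 32}};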
using LoadEntryVector = SmallVector<LoadEntry, 8>;
LoadEntryVector LoadSequence;
SmallVector<LoadEntry, 8> LoadSequence;
void createLoadCmpBlocks();
void createResultBlock();
@ -87,23 +92,13 @@ class MemCmpExpansion {
void emitLoadCompareBlock(unsigned BlockIndex);
void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
unsigned &LoadIndex);
void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes);
void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
void emitMemCmpResultBlock();
Value *getMemCmpExpansionZeroCase();
Value *getMemCmpEqZeroOneBlock();
Value *getMemCmpOneBlock();
Value *getPtrToElementAtOffset(Value *Source, Type *LoadSizeType,
uint64_t OffsetBytes);
static LoadEntryVector
computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte);
static LoadEntryVector
computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize,
unsigned MaxNumLoads,
unsigned &NumLoadsNonOneByte);
public:
public:
MemCmpExpansion(CallInst *CI, uint64_t Size,
const TargetTransformInfo::MemCmpExpansionOptions &Options,
unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
@ -115,76 +110,6 @@ public:
Value *getMemCmpExpansion();
};
MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence(
uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) {
NumLoadsNonOneByte = 0;
LoadEntryVector LoadSequence;
uint64_t Offset = 0;
while (Size && !LoadSizes.empty()) {
const unsigned LoadSize = LoadSizes.front();
const uint64_t NumLoadsForThisSize = Size / LoadSize;
if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
// Do not expand if the total number of loads is larger than what the
// target allows. Note that it's important that we exit before completing
// the expansion to avoid using a ton of memory to store the expansion for
// large sizes.
return {};
}
if (NumLoadsForThisSize > 0) {
for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
LoadSequence.push_back({LoadSize, Offset});
Offset += LoadSize;
}
if (LoadSize > 1)
++NumLoadsNonOneByte;
Size = Size % LoadSize;
}
LoadSizes = LoadSizes.drop_front();
}
return LoadSequence;
}
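The helper above is removed by this revert, with the equivalent greedy loop restored inline in the constructor further down. A standalone sketch of the same computation, handy for checking decompositions by hand, follows; the names and container types are ours, not LLVM's, and LoadSizes is assumed to be sorted in decreasing order as documented earlier.

#include <cstdint>
#include <utility>
#include <vector>

// {LoadSize, Offset} pairs, mirroring LoadEntry.
using Seq = std::vector<std::pair<unsigned, uint64_t>>;

static Seq greedySequence(uint64_t Size,
                          const std::vector<unsigned> &LoadSizes,
                          unsigned MaxNumLoads) {
  Seq Result;
  uint64_t Offset = 0;
  for (unsigned LoadSize : LoadSizes) {
    if (Size == 0)
      break;
    const uint64_t NumLoads = Size / LoadSize;
    if (Result.size() + NumLoads > MaxNumLoads)
      return {}; // too many loads: the pass gives up on expanding
    for (uint64_t I = 0; I < NumLoads; ++I) {
      Result.emplace_back(LoadSize, Offset);
      Offset += LoadSize;
    }
    Size %= LoadSize;
  }
  return Result;
}
// Example: greedySequence(7, {4, 2, 1}, 4) yields {4,0}, {2,4}, {1,6}.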
MemCmpExpansion::LoadEntryVector
MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
const unsigned MaxLoadSize,
const unsigned MaxNumLoads,
unsigned &NumLoadsNonOneByte) {
// These are already handled by the greedy approach.
if (Size < 2 || MaxLoadSize < 2)
return {};
// We try to do as many non-overlapping loads as possible starting from the
// beginning.
const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize;
assert(NumNonOverlappingLoads && "there must be at least one load");
// There remain 0 to (MaxLoadSize - 1) bytes to load, this will be done with
// an overlapping load.
Size = Size - NumNonOverlappingLoads * MaxLoadSize;
// Bail if we do not need an overlapping load; this is already handled by
// the greedy approach.
if (Size == 0)
return {};
// Bail if the number of loads (non-overlapping + potential overlapping one)
// is larger than the max allowed.
if ((NumNonOverlappingLoads + 1) > MaxNumLoads)
return {};
// Add non-overlapping loads.
LoadEntryVector LoadSequence;
uint64_t Offset = 0;
for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) {
LoadSequence.push_back({MaxLoadSize, Offset});
Offset += MaxLoadSize;
}
// Add the last overlapping load.
assert(Size > 0 && Size < MaxLoadSize && "broken invariant");
LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)});
NumLoadsNonOneByte = 1;
return LoadSequence;
}
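The overlapping variant, also removed here, covers the bulk with back-to-back max-size loads and the tail with one extra max-size load that overlaps its predecessor. A self-contained sketch with illustrative names:

#include <cstdint>
#include <utility>
#include <vector>

// Entries are {LoadSize, Offset} pairs. Assumes 2 <= MaxLoadSize <= Size,
// which the pass guarantees before calling.
static std::vector<std::pair<unsigned, uint64_t>>
overlappingSequence(uint64_t Size, unsigned MaxLoadSize,
                    unsigned MaxNumLoads) {
  std::vector<std::pair<unsigned, uint64_t>> Result;
  const uint64_t NumFull = Size / MaxLoadSize;
  const uint64_t Tail = Size - NumFull * MaxLoadSize;
  // No overlapping load needed, or too many loads: leave it to the greedy path.
  if (Tail == 0 || NumFull + 1 > MaxNumLoads)
    return Result;
  uint64_t Offset = 0;
  for (uint64_t I = 0; I < NumFull; ++I) {
    Result.emplace_back(MaxLoadSize, Offset);
    Offset += MaxLoadSize;
  }
  // The last load starts Tail bytes before the end, overlapping the previous
  // one: for Size = 7, MaxLoadSize = 4 this yields {4,0} and {4,3}, two loads
  // instead of the greedy three.
  Result.emplace_back(MaxLoadSize, Offset - (MaxLoadSize - Tail));
  return Result;
}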
// Initialize the basic block structure required for expansion of memcmp call
// with given maximum load size and memcmp size parameter.
// This structure includes:
@ -208,31 +133,38 @@ MemCmpExpansion::MemCmpExpansion(
Builder(CI) {
assert(Size > 0 && "zero blocks");
// Scale the max size down if the target can load more bytes than we need.
llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);
while (!LoadSizes.empty() && LoadSizes.front() > Size) {
LoadSizes = LoadSizes.drop_front();
size_t LoadSizeIndex = 0;
while (LoadSizeIndex < Options.LoadSizes.size() &&
Options.LoadSizes[LoadSizeIndex] > Size) {
++LoadSizeIndex;
}
assert(!LoadSizes.empty() && "cannot load Size bytes");
MaxLoadSize = LoadSizes.front();
this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
// Compute the decomposition.
unsigned GreedyNumLoadsNonOneByte = 0;
LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, MaxNumLoads,
GreedyNumLoadsNonOneByte);
NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;
assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
// If we allow overlapping loads and the load sequence is not already optimal,
// use overlapping loads.
if (Options.AllowOverlappingLoads &&
(LoadSequence.empty() || LoadSequence.size() > 2)) {
unsigned OverlappingNumLoadsNonOneByte = 0;
auto OverlappingLoads = computeOverlappingLoadSequence(
Size, MaxLoadSize, MaxNumLoads, OverlappingNumLoadsNonOneByte);
if (!OverlappingLoads.empty() &&
(LoadSequence.empty() ||
OverlappingLoads.size() < LoadSequence.size())) {
LoadSequence = OverlappingLoads;
NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;
uint64_t CurSize = Size;
uint64_t Offset = 0;
while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
assert(LoadSize > 0 && "zero load size");
const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
// Do not expand if the total number of loads is larger than what the
// target allows. Note that it's important that we exit before completing
// the expansion to avoid using a ton of memory to store the expansion for
// large sizes.
LoadSequence.clear();
return;
}
if (NumLoadsForThisSize > 0) {
for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
LoadSequence.push_back({LoadSize, Offset});
Offset += LoadSize;
}
if (LoadSize > 1) {
++NumLoadsNonOneByte;
}
CurSize = CurSize % LoadSize;
}
++LoadSizeIndex;
}
assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
}
@ -257,32 +189,30 @@ void MemCmpExpansion::createResultBlock() {
EndBlock->getParent(), EndBlock);
}
/// Return a pointer to an element of type `LoadSizeType` at offset
/// `OffsetBytes`.
Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source,
Type *LoadSizeType,
uint64_t OffsetBytes) {
if (OffsetBytes > 0) {
auto *ByteType = Type::getInt8Ty(CI->getContext());
Source = Builder.CreateGEP(
ByteType, Builder.CreateBitCast(Source, ByteType->getPointerTo()),
ConstantInt::get(ByteType, OffsetBytes));
}
return Builder.CreateBitCast(Source, LoadSizeType->getPointerTo());
}
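The "GEP->bitcast change" mentioned in the commit message is the addressing form used for non-zero offsets: r349731 indexed in bytes on an i8* and then bitcast to the load type, while this revert restores a bitcast followed by a GEP in units of the load type. A hedged side-by-side sketch, with the IR each form produces (taken from the memcmp test diffs below) shown in comments:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Byte-offset form introduced by r349731 (mirrors getPtrToElementAtOffset
// above). For an i16 load at byte offset 4:
//   %0 = getelementptr i8, i8* %src, i8 4
//   %1 = bitcast i8* %0 to i16*
static Value *byteOffsetForm(IRBuilder<> &B, Value *Src, Type *LoadSizeType,
                             uint64_t OffsetBytes) {
  Type *ByteType = B.getInt8Ty();
  if (OffsetBytes > 0)
    Src = B.CreateGEP(ByteType, B.CreateBitCast(Src, ByteType->getPointerTo()),
                      ConstantInt::get(ByteType, OffsetBytes));
  return B.CreateBitCast(Src, LoadSizeType->getPointerTo());
}

// Element-index form restored by this revert (mirrors the GEPIndex code
// below). For the same access:
//   %0 = bitcast i8* %src to i16*
//   %1 = getelementptr i16, i16* %0, i16 2
static Value *elementIndexForm(IRBuilder<> &B, Value *Src, Type *LoadSizeType,
                               uint64_t GEPIndex) {
  Src = B.CreateBitCast(Src, LoadSizeType->getPointerTo());
  if (GEPIndex != 0)
    Src = B.CreateGEP(LoadSizeType, Src,
                      ConstantInt::get(LoadSizeType, GEPIndex));
  return Src;
}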
// This function creates the IR instructions for loading and comparing 1 byte.
// It loads 1 byte from each source of the memcmp parameters with the given
// GEPIndex. It then subtracts the two loaded values and adds this result to the
// final phi node for selecting the memcmp result.
void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
unsigned OffsetBytes) {
unsigned GEPIndex) {
Value *Source1 = CI->getArgOperand(0);
Value *Source2 = CI->getArgOperand(1);
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
Value *Source1 =
getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, OffsetBytes);
Value *Source2 =
getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, OffsetBytes);
// Cast source to LoadSizeType*.
if (Source1->getType() != LoadSizeType)
Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
if (Source2->getType() != LoadSizeType)
Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
// Get the base address using the GEPIndex.
if (GEPIndex != 0) {
Source1 = Builder.CreateGEP(LoadSizeType, Source1,
ConstantInt::get(LoadSizeType, GEPIndex));
Source2 = Builder.CreateGEP(LoadSizeType, Source2,
ConstantInt::get(LoadSizeType, GEPIndex));
}
Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
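Read together with the comment above, the final 1-byte block reduces to a plain byte subtraction; a minimal sketch of that semantics:

// Both bytes are widened and subtracted, which is a valid memcmp result once
// all earlier blocks have already compared equal.
static int lastByteResult(unsigned char A, unsigned char B) {
  return static_cast<int>(A) - static_cast<int>(B);
}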
@ -340,10 +270,24 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
IntegerType *LoadSizeType =
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
CurLoadEntry.Offset);
Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
CurLoadEntry.Offset);
Value *Source1 = CI->getArgOperand(0);
Value *Source2 = CI->getArgOperand(1);
// Cast source to LoadSizeType*.
if (Source1->getType() != LoadSizeType)
Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
if (Source2->getType() != LoadSizeType)
Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
// Get the base address using a GEP.
if (CurLoadEntry.Offset != 0) {
Source1 = Builder.CreateGEP(
LoadSizeType, Source1,
ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
Source2 = Builder.CreateGEP(
LoadSizeType, Source2,
ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
}
// Get a constant or load a value for each source address.
Value *LoadSrc1 = nullptr;
@ -434,7 +378,8 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
if (CurLoadEntry.LoadSize == 1) {
MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset);
MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
CurLoadEntry.getGEPIndex());
return;
}
@ -443,12 +388,25 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
Value *Source1 = CI->getArgOperand(0);
Value *Source2 = CI->getArgOperand(1);
Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
CurLoadEntry.Offset);
Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
CurLoadEntry.Offset);
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
// Cast source to LoadSizeType*.
if (Source1->getType() != LoadSizeType)
Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
if (Source2->getType() != LoadSizeType)
Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
// Get the base address using a GEP.
if (CurLoadEntry.Offset != 0) {
Source1 = Builder.CreateGEP(
LoadSizeType, Source1,
ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
Source2 = Builder.CreateGEP(
LoadSizeType, Source2,
ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
}
// Load LoadSizeType from the base address.
Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
@ -736,6 +694,7 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
if (SizeVal == 0) {
return false;
}
// TTI call to check if target would like to expand memcmp. Also, get the
// available load sizes.
const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
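IsUsedForZeroCmp distinguishes callers that only test the result against zero from callers that need memcmp's ordering; the X86 hunk below keeps two option sets (EqZeroOptions vs. ThreeWayOptions) for exactly this split. A small illustration with our own function names:

#include <cstring>

// Only compared against zero: eligible for the cheaper equality-only
// expansion, where loads are simply XORed and ORed together.
static bool equalOnly(const char *A, const char *B) {
  return std::memcmp(A, B, 16) == 0;
}

// The sign of the result is observed, so the three-way handling (bswap-based
// comparison, visible in the IR test diff below, or a libcall) is needed.
static int threeWay(const char *A, const char *B) {
  return std::memcmp(A, B, 16);
}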


@ -1886,7 +1886,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::BITREVERSE, MVT::i64, 14 }
{ ISD::BITREVERSE, MVT::i64, 14 }
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::BITREVERSE, MVT::i32, 14 },
@ -2899,9 +2899,6 @@ X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
Options.LoadSizes.push_back(4);
Options.LoadSizes.push_back(2);
Options.LoadSizes.push_back(1);
// All GPR and vector loads can be unaligned. SIMD compare requires integer
// vectors (SSE2/AVX2).
Options.AllowOverlappingLoads = true;
return Options;
}();
return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;


@ -639,33 +639,17 @@ define i32 @length24(i8* %X, i8* %Y) nounwind optsize {
}
define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NOSSE-LABEL: length24_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
; X86-LABEL: length24_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
; X64-SSE2: # %bb.0:
@ -699,30 +683,17 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
}
define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X86-NOSSE-LABEL: length24_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
; X86-LABEL: length24_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
; X64-SSE2: # %bb.0:


@ -362,24 +362,24 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
define i1 @length7_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length7_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 3(%ecx), %ecx
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 3(%eax), %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: pushl $0
; X86-NEXT: pushl $7
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length7_eq:
; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl 3(%rdi), %ecx
; X64-NEXT: xorl (%rsi), %eax
; X64-NEXT: xorl 3(%rsi), %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: pushq %rax
; X64-NEXT: movl $7, %edx
; X64-NEXT: callq memcmp
; X64-NEXT: testl %eax, %eax
; X64-NEXT: setne %al
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind
%c = icmp ne i32 %m, 0
@ -548,12 +548,12 @@ define i1 @length11_eq(i8* %X, i8* %Y) nounwind {
;
; X64-LABEL: length11_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 3(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 3(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: pushq %rax
; X64-NEXT: movl $11, %edx
; X64-NEXT: callq memcmp
; X64-NEXT: testl %eax, %eax
; X64-NEXT: sete %al
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 11) nounwind
%c = icmp eq i32 %m, 0
@ -640,12 +640,12 @@ define i1 @length13_eq(i8* %X, i8* %Y) nounwind {
;
; X64-LABEL: length13_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 5(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 5(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: pushq %rax
; X64-NEXT: movl $13, %edx
; X64-NEXT: callq memcmp
; X64-NEXT: testl %eax, %eax
; X64-NEXT: sete %al
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 13) nounwind
%c = icmp eq i32 %m, 0
@ -667,12 +667,12 @@ define i1 @length14_eq(i8* %X, i8* %Y) nounwind {
;
; X64-LABEL: length14_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 6(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 6(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: pushq %rax
; X64-NEXT: movl $14, %edx
; X64-NEXT: callq memcmp
; X64-NEXT: testl %eax, %eax
; X64-NEXT: sete %al
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 14) nounwind
%c = icmp eq i32 %m, 0
@ -694,12 +694,12 @@ define i1 @length15_eq(i8* %X, i8* %Y) nounwind {
;
; X64-LABEL: length15_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 7(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 7(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: pushq %rax
; X64-NEXT: movl $15, %edx
; X64-NEXT: callq memcmp
; X64-NEXT: testl %eax, %eax
; X64-NEXT: sete %al
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind
%c = icmp eq i32 %m, 0
@ -885,45 +885,17 @@ define i32 @length24(i8* %X, i8* %Y) nounwind {
}
define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length24_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length24_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $24
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
; X86-LABEL: length24_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
; X64-SSE2: # %bb.0:
@ -957,42 +929,17 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
}
define i1 @length24_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length24_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length24_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $24
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
; X86-LABEL: length24_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
; X64-SSE2: # %bb.0:


@ -130,11 +130,11 @@ define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; ALL: loadbb1:
; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4
; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16*
; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; ALL-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16*
; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]]
; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2
; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2
; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
@ -178,11 +178,11 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X32: loadbb1:
; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4
; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32*
; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32*
; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]]
; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1
; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
@ -272,11 +272,11 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16*
; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]]
; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4
; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4
; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
@ -324,11 +324,11 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32*
; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]]
; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2
; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2
; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
@ -394,11 +394,11 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i64*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i64*
; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP11]]
; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64*
; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1
; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1
; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]]
; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]]
; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
@ -597,11 +597,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4
; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; X32-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
; X32-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2
; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
@ -625,11 +625,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2
; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@ -645,11 +645,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2
; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
@ -668,71 +668,11 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
}
define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq7(
; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3
; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3
; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64_1LD-LABEL: @cmp_eq7(
; X64_1LD-NEXT: br label [[LOADBB:%.*]]
; X64_1LD: res_block:
; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]]
; X64_1LD: loadbb:
; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; X64_1LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X64_1LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64_1LD: endblock:
; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_1LD-NEXT: ret i32 [[CONV]]
;
; X64_2LD-LABEL: @cmp_eq7(
; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
; X64_2LD-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_2LD-NEXT: ret i32 [[CONV]]
; ALL-LABEL: @cmp_eq7(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
%cmp = icmp eq i32 %call, 0
@ -747,11 +687,11 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4
; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1
; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1
; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
@ -854,11 +794,11 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4
; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4
; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@ -874,11 +814,11 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4
; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4
; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i64
; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i64
@ -897,57 +837,11 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
}
define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq11(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64_1LD-LABEL: @cmp_eq11(
; X64_1LD-NEXT: br label [[LOADBB:%.*]]
; X64_1LD: res_block:
; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]]
; X64_1LD: loadbb:
; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64_1LD: endblock:
; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_1LD-NEXT: ret i32 [[CONV]]
;
; X64_2LD-LABEL: @cmp_eq11(
; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_2LD-NEXT: ret i32 [[CONV]]
; ALL-LABEL: @cmp_eq11(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
%cmp = icmp eq i32 %call, 0
@ -974,11 +868,11 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2
; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2
; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@ -994,11 +888,11 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2
; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2
; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
; X64_2LD-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
@ -1017,57 +911,11 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
}
define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq13(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64_1LD-LABEL: @cmp_eq13(
; X64_1LD-NEXT: br label [[LOADBB:%.*]]
; X64_1LD: res_block:
; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]]
; X64_1LD: loadbb:
; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 5
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 5
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64_1LD: endblock:
; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_1LD-NEXT: ret i32 [[CONV]]
;
; X64_2LD-LABEL: @cmp_eq13(
; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 5
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 5
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_2LD-NEXT: ret i32 [[CONV]]
; ALL-LABEL: @cmp_eq13(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
%cmp = icmp eq i32 %call, 0
@ -1076,57 +924,11 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
}
define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq14(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64_1LD-LABEL: @cmp_eq14(
; X64_1LD-NEXT: br label [[LOADBB:%.*]]
; X64_1LD: res_block:
; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]]
; X64_1LD: loadbb:
; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 6
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 6
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64_1LD: endblock:
; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_1LD-NEXT: ret i32 [[CONV]]
;
; X64_2LD-LABEL: @cmp_eq14(
; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 6
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 6
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_2LD-NEXT: ret i32 [[CONV]]
; ALL-LABEL: @cmp_eq14(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
%cmp = icmp eq i32 %call, 0
@ -1135,57 +937,11 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
}
define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq15(
; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64_1LD-LABEL: @cmp_eq15(
; X64_1LD-NEXT: br label [[LOADBB:%.*]]
; X64_1LD: res_block:
; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]]
; X64_1LD: loadbb:
; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 7
; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 7
; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64_1LD: endblock:
; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_1LD-NEXT: ret i32 [[CONV]]
;
; X64_2LD-LABEL: @cmp_eq15(
; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 7
; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 7
; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X64_2LD-NEXT: ret i32 [[CONV]]
; ALL-LABEL: @cmp_eq15(
; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
%cmp = icmp eq i32 %call, 0