Revert 279703, it caused PR31404.

llvm-svn: 289923
2025-01-31 20:51:52 +01:00 · 2016-12-16 04:51:25 +00:00 · 2016-12-16 04:51:25 +00:00 · b553ce3ab5
commit b553ce3ab5
parent 2345112c5b
5 changed files with 25 additions and 255 deletions
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@ -222,11 +222,10 @@ private:
                       const uint16_t *QOpcodes);

  /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
-  /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
+  /// should be 2, 3 or 4.  The opcode array specifies the instructions used
  /// for loading D registers.  (Q registers are not supported.)
  void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *DOpcodes,
-                    const uint16_t *QOpcodes = nullptr);
+                    const uint16_t *Opcodes);

  /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
  /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
@ -1763,12 +1762,6 @@ static bool isVLDfixed(unsigned Opc)
  case ARM::VLD1q16wb_fixed : return true;
  case ARM::VLD1q32wb_fixed : return true;
  case ARM::VLD1q64wb_fixed : return true;
-  case ARM::VLD1DUPd8wb_fixed : return true;
-  case ARM::VLD1DUPd16wb_fixed : return true;
-  case ARM::VLD1DUPd32wb_fixed : return true;
-  case ARM::VLD1DUPq8wb_fixed : return true;
-  case ARM::VLD1DUPq16wb_fixed : return true;
-  case ARM::VLD1DUPq32wb_fixed : return true;
  case ARM::VLD2d8wb_fixed : return true;
  case ARM::VLD2d16wb_fixed : return true;
  case ARM::VLD2d32wb_fixed : return true;
@ -1823,12 +1816,6 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
  case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
  case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
  case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
-  case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register;
-  case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register;
-  case ARM::VLD1DUPd32wb_fixed : return ARM::VLD1DUPd32wb_register;
-  case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register;
-  case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register;
-  case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;

  case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
  case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
@ -2269,9 +2256,8 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
 }

 void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                                   const uint16_t *DOpcodes,
-                                   const uint16_t *QOpcodes) {
-  assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+                                   const uint16_t *Opcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
  SDLoc dl(N);

  SDValue MemAddr, Align;
@ -2299,21 +2285,19 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
  }
  Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);

-  unsigned Opc;
+  unsigned OpcodeIndex;
  switch (VT.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("unhandled vld-dup type");
-  case MVT::v8i8:  Opc = DOpcodes[0]; break;
-  case MVT::v16i8: Opc = QOpcodes[0]; break;
-  case MVT::v4i16: Opc = DOpcodes[1]; break;
-  case MVT::v8i16: Opc = QOpcodes[1]; break;
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
  case MVT::v2f32:
-  case MVT::v2i32: Opc = DOpcodes[2]; break;
-  case MVT::v4f32:
-  case MVT::v4i32: Opc = QOpcodes[2]; break;
+  case MVT::v2i32: OpcodeIndex = 2; break;
  }

  SDValue Pred = getAL(CurDAG, dl);
  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDValue SuperReg;
+  unsigned Opc = Opcodes[OpcodeIndex];
  SmallVector<SDValue, 6> Ops;
  Ops.push_back(MemAddr);
  Ops.push_back(Align);
@ -2321,8 +2305,6 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
    // fixed-stride update instructions don't have an explicit writeback
    // operand. It's implicit in the opcode itself.
    SDValue Inc = N->getOperand(2);
-    if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
    if (!isa<ConstantSDNode>(Inc.getNode()))
      Ops.push_back(Inc);
    // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
@ -2341,18 +2323,14 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
  ResTys.push_back(MVT::Other);
  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+  SuperReg = SDValue(VLdDup, 0);

  // Extract the subregisters.
-  if (NumVecs == 1) {
-    ReplaceUses(SDValue(N, 0), SDValue(VLdDup, 0));
-  } else {
-    SDValue SuperReg = SDValue(VLdDup, 0);
-    static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
-    unsigned SubIdx = ARM::dsub_0;
-    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
-      ReplaceUses(SDValue(N, Vec),
-                  CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
-  }
+  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
+  unsigned SubIdx = ARM::dsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
  if (isUpdating)
    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
@ -3424,15 +3402,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
    return;
  }

-  case ARMISD::VLD1DUP: {
-    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8, ARM::VLD1DUPd16,
-                                         ARM::VLD1DUPd32 };
-    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
-                                         ARM::VLD1DUPq32 };
-    SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
-    return;
-  }
-
  case ARMISD::VLD2DUP: {
    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
                                        ARM::VLD2DUPd32 };
@ -3456,17 +3425,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
    return;
  }

-  case ARMISD::VLD1DUP_UPD: {
-    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8wb_fixed,
-                                         ARM::VLD1DUPd16wb_fixed,
-                                         ARM::VLD1DUPd32wb_fixed };
-    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
-                                         ARM::VLD1DUPq16wb_fixed,
-                                         ARM::VLD1DUPq32wb_fixed };
-    SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
-    return;
-  }
-
  case ARMISD::VLD2DUP_UPD: {
    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
                                        ARM::VLD2DUPd16wb_fixed,
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@ -1428,7 +1428,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
  case ARMISD::VBSL:          return "ARMISD::VBSL";
  case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
-  case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
@ -1439,7 +1438,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
-  case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
@ -10474,7 +10472,6 @@ static SDValue CombineBaseUpdate(SDNode *N,
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
-      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
@ -10589,8 +10586,8 @@ static SDValue CombineBaseUpdate(SDNode *N,
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

-    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
+                                           Ops, AlignedVecTy,
                                           MemN->getMemOperand());

    // Update the uses.
@ -10735,30 +10732,6 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
 }

-/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
-static SDValue PerformVDUPCombine(SDNode *N,
-                                  TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-  SDValue Op = N->getOperand(0);
-
-  // Match VDUP(LOAD) -> VLD1DUP.
-  // We match this pattern here rather than waiting for isel because the
-  // transform is only legal for unindexed loads.
-  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
-  if (LD && Op.hasOneUse() && LD->isUnindexed()) {
-    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
-                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
-    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
-    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
-                                             Ops, LD->getMemoryVT(),
-                                             LD->getMemOperand());
-    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
-    return VLDDup;
-  }
-
-  return SDValue();
-}
-
 static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
@ -11586,7 +11559,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
-  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
@ -11602,7 +11574,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
-  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@ -190,8 +190,7 @@ namespace llvm {
      MEMCPY,

      // Vector load N-element structure to all lanes:
-      VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
-      VLD2DUP,
+      VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
      VLD3DUP,
      VLD4DUP,

@ -203,7 +202,6 @@ namespace llvm {
      VLD2LN_UPD,
      VLD3LN_UPD,
      VLD4LN_UPD,
-      VLD1DUP_UPD,
      VLD2DUP_UPD,
      VLD3DUP_UPD,
      VLD4DUP_UPD,
--- a/test/CodeGen/ARM/vlddup.ll
+++ b/test/CodeGen/ARM/vlddup.ll
@ -10,84 +10,6 @@ define <8 x i8> @vld1dupi8(i8* %A) nounwind {
        ret <8 x i8> %tmp3
 }

-define <8 x i8> @vld1dupi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
-entry:
-;CHECK-LABEL: vld1dupi8_preinc:
-;CHECK: vld1.8 {d16[]}, [r1]
-  %0 = load i8*, i8** %a, align 4
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
-  %1 = load i8, i8* %add.ptr, align 1
-  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
-  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
-  store i8* %add.ptr, i8** %a, align 4
-  ret <8 x i8> %lane
-}
-
-define <8 x i8> @vld1dupi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
-entry:
-;CHECK-LABEL: vld1dupi8_postinc_fixed:
-;CHECK: vld1.8 {d16[]}, [r1]!
-  %0 = load i8*, i8** %a, align 4
-  %1 = load i8, i8* %0, align 1
-  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
-  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
-  store i8* %add.ptr, i8** %a, align 4
-  ret <8 x i8> %lane
-}
-
-define <8 x i8> @vld1dupi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
-entry:
-;CHECK-LABEL: vld1dupi8_postinc_register:
-;CHECK: vld1.8 {d16[]}, [r2], r1
-  %0 = load i8*, i8** %a, align 4
-  %1 = load i8, i8* %0, align 1
-  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
-  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
-  store i8* %add.ptr, i8** %a, align 4
-  ret <8 x i8> %lane
-}
-
-define <16 x i8> @vld1dupqi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
-entry:
-;CHECK-LABEL: vld1dupqi8_preinc:
-;CHECK: vld1.8 {d16[], d17[]}, [r1]
-  %0 = load i8*, i8** %a, align 4
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
-  %1 = load i8, i8* %add.ptr, align 1
-  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
-  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
-  store i8* %add.ptr, i8** %a, align 4
-  ret <16 x i8> %lane
-}
-
-define <16 x i8> @vld1dupqi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
-entry:
-;CHECK-LABEL: vld1dupqi8_postinc_fixed:
-;CHECK: vld1.8 {d16[], d17[]}, [r1]!
-  %0 = load i8*, i8** %a, align 4
-  %1 = load i8, i8* %0, align 1
-  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
-  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
-  store i8* %add.ptr, i8** %a, align 4
-  ret <16 x i8> %lane
-}
-
-define <16 x i8> @vld1dupqi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
-entry:
-;CHECK-LABEL: vld1dupqi8_postinc_register:
-;CHECK: vld1.8 {d16[], d17[]}, [r2], r1
-  %0 = load i8*, i8** %a, align 4
-  %1 = load i8, i8* %0, align 1
-  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
-  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
-  store i8* %add.ptr, i8** %a, align 4
-  ret <16 x i8> %lane
-}
-
 define <4 x i16> @vld1dupi16(i16* %A) nounwind {
 ;CHECK-LABEL: vld1dupi16:
 ;Check the alignment value.  Max for this instruction is 16 bits:
@ -98,15 +20,6 @@ define <4 x i16> @vld1dupi16(i16* %A) nounwind {
        ret <4 x i16> %tmp3
 }

-define <4 x i16> @vld1dupi16_misaligned(i16* %A) nounwind {
-;CHECK-LABEL: vld1dupi16_misaligned:
-;CHECK: vld1.16 {d16[]}, [r0]
-	%tmp1 = load i16, i16* %A, align 1
-	%tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
-	%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
-        ret <4 x i16> %tmp3
-}
-
 define <2 x i32> @vld1dupi32(i32* %A) nounwind {
 ;CHECK-LABEL: vld1dupi32:
 ;Check the alignment value.  Max for this instruction is 32 bits:
@ -162,63 +75,6 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind {
        ret <8 x i8> %tmp5
 }

-define void @vld2dupi8_preinc(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %b) nounwind {
-;CHECK-LABEL: vld2dupi8_preinc:
-;CHECK: vld2.8 {d16[], d17[]}, [r2]
-entry:
-  %0 = load i8*, i8** %a, align 4
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
-  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
-  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
-  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
-  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
-  store i8* %add.ptr, i8** %a, align 4
-  %r8 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
-  store <8 x i8> %lane, <8 x i8>* %r8, align 8
-  %r11 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
-  store <8 x i8> %lane1, <8 x i8>* %r11, align 8
-  ret void
-}
-
-define void @vld2dupi8_postinc_fixed(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a) nounwind {
-entry:
-;CHECK-LABEL: vld2dupi8_postinc_fixed:
-;CHECK: vld2.8 {d16[], d17[]}, [r2]!
-  %0 = load i8*, i8** %a, align 4
-  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
-  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
-  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
-  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 2
-  store i8* %add.ptr, i8** %a, align 4
-  %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
-  store <8 x i8> %lane, <8 x i8>* %r7, align 8
-  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
-  store <8 x i8> %lane1, <8 x i8>* %r10, align 8
-  ret void
-}
-
-define void @vld2dupi8_postinc_variable(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %n) nounwind {
-entry:
-;CHECK-LABEL: vld2dupi8_postinc_variable:
-;CHECK: vld2.8 {d16[], d17[]}, [r3], r2
-  %0 = load i8*, i8** %a, align 4
-  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
-  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
-  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
-  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
-  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
-  %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
-  store i8* %add.ptr, i8** %a, align 4
-  %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
-  store <8 x i8> %lane, <8 x i8>* %r7, align 8
-  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
-  store <8 x i8> %lane1, <8 x i8>* %r10, align 8
-  ret void
-}
-
 define <4 x i16> @vld2dupi16(i8* %A) nounwind {
 ;CHECK-LABEL: vld2dupi16:
 ;Check that a power-of-two alignment smaller than the total size of the memory
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@ -635,26 +635,13 @@ entry:
  ret void
 }

-define void @fmul_splat(<4 x float> * %a, <4 x float>* nocapture %dst, float %tmp) nounwind {
-; Look for a scalar float rather than a splat, then a vector*scalar multiply.
-; CHECK: vmov s0, r2
+define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+;   Look for doing a normal scalar FP load rather than a to-all-lanes load.
+;   e.g., "vldr s0, [r2]" rather than "vld1.32  {d18[], d19[]}, [r2:32]"
+;   Then check that the vector multiply has folded the splat to all lanes
+;   and used a vector * scalar instruction.
+; CHECK: vldr  {{s[0-9]+}}, [r2]
 ; CHECK: vmul.f32  q8, q8, d0[0]
-  %tmp5 = load <4 x float>, <4 x float>* %a, align 4
-  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
-  %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
-  %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
-  %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
-  %tmp10 = fmul <4 x float> %tmp9, %tmp5
-  store <4 x float> %tmp10, <4 x float>* %dst, align 4
-  ret void
-}
-
-define void @fmul_splat_load(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
-; Look for doing a normal scalar FP load rather than an to-all-lanes load,
-; then a vector*scalar multiply.
-; FIXME: Temporarily broken due to splat representation changes.
-; CHECK: vld1.32 {d18[], d19[]}, [r2:32]
-; CHECK: vmul.f32  q8, q9, q8
  %tmp = load float, float* %src, align 4
  %tmp5 = load <4 x float>, <4 x float>* %a, align 4
  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0