diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 2c37f448cad..37f8bbfd2c7 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -2707,6 +2707,30 @@ public:
   bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
                             DAGCombinerInfo &DCI) const;
 
+  /// Look at Vector Op. At this point, we know that only the DemandedElts
+  /// elements of the result of Op are ever used downstream. If we can use
+  /// this information to simplify Op, create a new simplified DAG node and
+  /// return true, storing the original and new nodes in TLO.
+  /// Otherwise, analyze the expression and return a mask of KnownUndef and
+  /// KnownZero elements for the expression (used to simplify the caller).
+  /// The KnownUndef/Zero elements may only be accurate for those elements
+  /// in the DemandedElts mask.
+  /// \p AssumeSingleUse When this parameter is true, this function will
+  /// attempt to simplify \p Op even if there are multiple uses.
+  /// Callers are responsible for correctly updating the DAG based on the
+  /// results of this function, because simply replacing TLO.Old with
+  /// TLO.New will be incorrect when this parameter is true and TLO.Old
+  /// has multiple uses.
+  bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
+                                  APInt &KnownUndef, APInt &KnownZero,
+                                  TargetLoweringOpt &TLO, unsigned Depth = 0,
+                                  bool AssumeSingleUse = false) const;
+
+  /// Helper wrapper around SimplifyDemandedVectorElts.
+  bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
+                                  APInt &KnownUndef, APInt &KnownZero,
+                                  DAGCombinerInfo &DCI) const;
+
   /// Determine which of the bits specified in Mask are known to be either zero
   /// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
   /// argument allows us to only collect the known bits that are shared by the
@@ -2735,6 +2759,15 @@ public:
                                              const SelectionDAG &DAG,
                                              unsigned Depth = 0) const;
 
+  /// Attempt to simplify any target nodes based on the demanded vector
+  /// elements, returning true on success. Otherwise, analyze the expression and
+  /// return a mask of KnownUndef and KnownZero elements for the expression
+  /// (used to simplify the caller). The KnownUndef/Zero elements may only be
+  /// accurate for those elements in the DemandedElts mask.
+  virtual bool SimplifyDemandedVectorEltsForTargetNode(
+      SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
+      APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
+
   struct DAGCombinerInfo {
     void *DC;  // The DAG Combiner object.
     CombineLevel Level;
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3d2ee5eff2a..19d201ed932 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -232,7 +232,17 @@ namespace {
       return SimplifyDemandedBits(Op, Demanded);
     }
 
+    /// Check the specified vector node value to see if it can be simplified or
+    /// if things it uses can be simplified as it only uses some of the
+    /// elements. If so, return true.
+ bool SimplifyDemandedVectorElts(SDValue Op) { + unsigned NumElts = Op.getValueType().getVectorNumElements(); + APInt Demanded = APInt::getAllOnesValue(NumElts); + return SimplifyDemandedVectorElts(Op, Demanded); + } + bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded); + bool SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded); bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); @@ -1085,6 +1095,28 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { return true; } +/// Check the specified vector node value to see if it can be simplified or +/// if things it uses can be simplified as it only uses some of the elements. +/// If so, return true. +bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, + const APInt &Demanded) { + TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); + APInt KnownUndef, KnownZero; + if (!TLI.SimplifyDemandedVectorElts(Op, Demanded, KnownUndef, KnownZero, TLO)) + return false; + + // Revisit the node. + AddToWorklist(Op.getNode()); + + // Replace the old value with the new one. + ++NodesCombined; + DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); + dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); dbgs() << '\n'); + + CommitTargetLoweringOpt(TLO); + return true; +} + void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { SDLoc DL(Load); EVT VT = Load->getValueType(0); @@ -15558,92 +15590,6 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { return SDValue(); } -static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements, - SDValue V, SelectionDAG &DAG) { - SDLoc DL(V); - EVT VT = V.getValueType(); - - switch (V.getOpcode()) { - default: - return V; - - case ISD::CONCAT_VECTORS: { - EVT OpVT = V->getOperand(0).getValueType(); - int OpSize = OpVT.getVectorNumElements(); - SmallBitVector OpUsedElements(OpSize, false); - bool FoundSimplification = false; - SmallVector NewOps; - NewOps.reserve(V->getNumOperands()); - for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) { - SDValue Op = V->getOperand(i); - bool OpUsed = false; - for (int j = 0; j < OpSize; ++j) - if (UsedElements[i * OpSize + j]) { - OpUsedElements[j] = true; - OpUsed = true; - } - NewOps.push_back( - OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG) - : DAG.getUNDEF(OpVT)); - FoundSimplification |= Op == NewOps.back(); - OpUsedElements.reset(); - } - if (FoundSimplification) - V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps); - return V; - } - - case ISD::INSERT_SUBVECTOR: { - SDValue BaseV = V->getOperand(0); - SDValue SubV = V->getOperand(1); - auto *IdxN = dyn_cast(V->getOperand(2)); - if (!IdxN) - return V; - - int SubSize = SubV.getValueType().getVectorNumElements(); - int Idx = IdxN->getZExtValue(); - bool SubVectorUsed = false; - SmallBitVector SubUsedElements(SubSize, false); - for (int i = 0; i < SubSize; ++i) - if (UsedElements[i + Idx]) { - SubVectorUsed = true; - SubUsedElements[i] = true; - UsedElements[i + Idx] = false; - } - - // Now recurse on both the base and sub vectors. - SDValue SimplifiedSubV = - SubVectorUsed - ? 
simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG) - : DAG.getUNDEF(SubV.getValueType()); - SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG); - if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV) - V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - SimplifiedBaseV, SimplifiedSubV, V->getOperand(2)); - return V; - } - } -} - -static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0, - SDValue N1, SelectionDAG &DAG) { - EVT VT = SVN->getValueType(0); - int NumElts = VT.getVectorNumElements(); - SmallBitVector N0UsedElements(NumElts, false), N1UsedElements(NumElts, false); - for (int M : SVN->getMask()) - if (M >= 0 && M < NumElts) - N0UsedElements[M] = true; - else if (M >= NumElts) - N1UsedElements[M - NumElts] = true; - - SDValue S0 = simplifyShuffleOperandRecursively(N0UsedElements, N0, DAG); - SDValue S1 = simplifyShuffleOperandRecursively(N1UsedElements, N1, DAG); - if (S0 == N0 && S1 == N1) - return SDValue(); - - return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask()); -} - static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0, SDValue N1, SelectionDAG &DAG) { auto isUndefElt = [](SDValue V, int Idx) { @@ -16181,11 +16127,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } } - // There are various patterns used to build up a vector from smaller vectors, - // subvectors, or elements. Scan chains of these and replace unused insertions - // or components with undef. - if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG)) - return S; + // Simplify source operands based on shuffle mask. + if (SimplifyDemandedVectorElts(SDValue(N, 0))) + return SDValue(N, 0); // Match shuffles that can be converted to any_vector_extend_in_reg. if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes)) diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 79f7d16acb2..a066217ba17 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1279,6 +1279,197 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } +bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op, + const APInt &DemandedElts, + APInt &KnownUndef, + APInt &KnownZero, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + + bool Simplified = + SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO); + if (Simplified) + DCI.CommitTargetLoweringOpt(TLO); + return Simplified; +} + +bool TargetLowering::SimplifyDemandedVectorElts( + SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, + APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth, + bool AssumeSingleUse) const { + EVT VT = Op.getValueType(); + APInt DemandedElts = DemandedEltMask; + unsigned NumElts = DemandedElts.getBitWidth(); + assert(VT.isVector() && "Expected vector op"); + assert(VT.getVectorNumElements() == NumElts && + "Mask size mismatches value type element count!"); + + KnownUndef = KnownZero = APInt::getNullValue(NumElts); + + // Undef operand. + if (Op.isUndef()) { + KnownUndef.setAllBits(); + return false; + } + + // If Op has other users, assume that all elements are needed. + if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) + DemandedElts.setAllBits(); + + // Not demanding any elements from Op. 
+  if (DemandedElts == 0) {
+    KnownUndef.setAllBits();
+    return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+  }
+
+  // Limit search depth.
+  if (Depth >= 6)
+    return false;
+
+  SDLoc DL(Op);
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+  switch (Op.getOpcode()) {
+  case ISD::SCALAR_TO_VECTOR: {
+    if (!DemandedElts[0]) {
+      KnownUndef.setAllBits();
+      return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+    }
+    KnownUndef.setHighBits(NumElts - 1);
+    break;
+  }
+  case ISD::BUILD_VECTOR: {
+    // Check all elements and simplify any unused elements with UNDEF.
+    if (!DemandedElts.isAllOnesValue()) {
+      // Don't simplify BROADCASTS.
+      if (llvm::any_of(Op->op_values(),
+                       [&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
+        SmallVector<SDValue, 32> Ops(Op->op_begin(), Op->op_end());
+        bool Updated = false;
+        for (unsigned i = 0; i != NumElts; ++i) {
+          if (!DemandedElts[i] && !Ops[i].isUndef()) {
+            Ops[i] = TLO.DAG.getUNDEF(Ops[0].getValueType());
+            KnownUndef.setBit(i);
+            Updated = true;
+          }
+        }
+        if (Updated)
+          return TLO.CombineTo(Op, TLO.DAG.getBuildVector(VT, DL, Ops));
+      }
+    }
+    for (unsigned i = 0; i != NumElts; ++i) {
+      SDValue SrcOp = Op.getOperand(i);
+      if (SrcOp.isUndef()) {
+        KnownUndef.setBit(i);
+      } else if (EltSizeInBits == SrcOp.getScalarValueSizeInBits() &&
+                 (isNullConstant(SrcOp) || isNullFPConstant(SrcOp))) {
+        KnownZero.setBit(i);
+      }
+    }
+    break;
+  }
+  case ISD::CONCAT_VECTORS: {
+    EVT SubVT = Op.getOperand(0).getValueType();
+    unsigned NumSubVecs = Op.getNumOperands();
+    unsigned NumSubElts = SubVT.getVectorNumElements();
+    for (unsigned i = 0; i != NumSubVecs; ++i) {
+      SDValue SubOp = Op.getOperand(i);
+      APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+      APInt SubUndef, SubZero;
+      if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
+                                     Depth + 1))
+        return true;
+      KnownUndef.insertBits(SubUndef, i * NumSubElts);
+      KnownZero.insertBits(SubZero, i * NumSubElts);
+    }
+    break;
+  }
+  case ISD::INSERT_SUBVECTOR: {
+    if (!isa<ConstantSDNode>(Op.getOperand(2)))
+      break;
+    SDValue Base = Op.getOperand(0);
+    SDValue Sub = Op.getOperand(1);
+    EVT SubVT = Sub.getValueType();
+    unsigned NumSubElts = SubVT.getVectorNumElements();
+    APInt Idx = cast<ConstantSDNode>(Op.getOperand(2))->getAPIntValue();
+    if (Idx.uge(NumElts - NumSubElts))
+      break;
+    unsigned SubIdx = Idx.getZExtValue();
+    APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
+    APInt SubUndef, SubZero;
+    if (SimplifyDemandedVectorElts(Sub, SubElts, SubUndef, SubZero, TLO,
+                                   Depth + 1))
+      return true;
+    APInt BaseElts = DemandedElts;
+    BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
+    if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
+                                   Depth + 1))
+      return true;
+    KnownUndef.insertBits(SubUndef, SubIdx);
+    KnownZero.insertBits(SubZero, SubIdx);
+    break;
+  }
+  case ISD::VECTOR_SHUFFLE: {
+    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+    // Collect demanded elements from shuffle operands.
+    APInt DemandedLHS(NumElts, 0);
+    APInt DemandedRHS(NumElts, 0);
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int M = ShuffleMask[i];
+      if (M < 0 || !DemandedElts[i])
+        continue;
+      assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
+      if (M < (int)NumElts)
+        DemandedLHS.setBit(M);
+      else
+        DemandedRHS.setBit(M - NumElts);
+    }
+
+    // See if we can simplify either shuffle operand.
+ APInt UndefLHS, ZeroLHS; + APInt UndefRHS, ZeroRHS; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, UndefLHS, + ZeroLHS, TLO, Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, UndefRHS, + ZeroRHS, TLO, Depth + 1)) + return true; + + // Propagate undef/zero elements from LHS/RHS. + for (unsigned i = 0; i != NumElts; ++i) { + int M = ShuffleMask[i]; + if (M < 0) { + KnownUndef.setBit(i); + } else if (M < (int)NumElts) { + if (UndefLHS[M]) + KnownUndef.setBit(i); + if (ZeroLHS[M]) + KnownZero.setBit(i); + } else { + if (UndefRHS[M - NumElts]) + KnownUndef.setBit(i); + if (ZeroRHS[M - NumElts]) + KnownZero.setBit(i); + } + } + break; + } + default: { + if (Op.getOpcode() >= ISD::BUILTIN_OP_END) + if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef, + KnownZero, TLO, Depth)) + return true; + break; + } + } + + assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero"); + return false; +} + /// Determine which of the bits specified in Mask are known to be either zero or /// one and return them in the Known. void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, @@ -1323,6 +1514,18 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, return 1; } +bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode( + SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, + TargetLoweringOpt &TLO, unsigned Depth) const { + assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || + Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_VOID) && + "Should use SimplifyDemandedVectorElts if you don't know whether Op" + " is a target node!"); + return false; +} + // FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must // work with truncating build vectors and vectors with elements of less than // 8 bits. 
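Illustrative note (not part of the patch): SimplifyDemandedVectorEltsForTargetNode is the hook a backend overrides to take part in demanded-elements simplification for its own opcodes. A minimal sketch of such an override is shown below; MYTGTISD::FROB, its assumed lane-wise behaviour, and the MyTargetLowering class are invented for illustration, while the hook signature and the TLO/APInt helpers come from the patch above.

// Sketch only: a hypothetical backend override for a lane-wise unary node.
bool MyTargetLowering::SimplifyDemandedVectorEltsForTargetNode(
    SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
    TargetLoweringOpt &TLO, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case MYTGTISD::FROB: {
    // A lane-wise op needs exactly the demanded lanes of its source, so
    // forward the same demanded-elements mask into the operand.
    SDValue Src = Op.getOperand(0);
    APInt SrcUndef, SrcZero;
    if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, SrcZero, TLO,
                                   Depth + 1))
      return true;
    // Assume an undef input lane produces an undef result lane for this
    // hypothetical node; zero lanes are not assumed to propagate.
    KnownUndef = SrcUndef;
    break;
  }
  }
  return false;
}

Returning false reports that nothing was rewritten, and the caller then relies only on the returned KnownUndef/KnownZero masks, exactly as the generic opcode cases above do.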
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll index 750ea75716e..f7b19f0476f 100644 --- a/test/CodeGen/Mips/cconv/vector.ll +++ b/test/CodeGen/Mips/cconv/vector.ll @@ -50,40 +50,40 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) { ; ; MIPS32R5EB-LABEL: i8_2: ; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -16 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 16 -; MIPS32R5EB-NEXT: sw $5, 8($sp) -; MIPS32R5EB-NEXT: sw $4, 12($sp) -; MIPS32R5EB-NEXT: ldi.b $w0, 0 -; MIPS32R5EB-NEXT: lbu $1, 9($sp) -; MIPS32R5EB-NEXT: lbu $2, 8($sp) -; MIPS32R5EB-NEXT: move.v $w1, $w0 -; MIPS32R5EB-NEXT: insert.w $w1[0], $2 -; MIPS32R5EB-NEXT: insert.w $w1[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 12($sp) -; MIPS32R5EB-NEXT: insert.w $w0[0], $1 -; MIPS32R5EB-NEXT: lbu $1, 10($sp) -; MIPS32R5EB-NEXT: lbu $2, 13($sp) -; MIPS32R5EB-NEXT: insert.w $w0[1], $2 -; MIPS32R5EB-NEXT: insert.w $w1[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 11($sp) -; MIPS32R5EB-NEXT: insert.w $w1[3], $1 -; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 -; MIPS32R5EB-NEXT: lbu $1, 14($sp) -; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 -; MIPS32R5EB-NEXT: insert.w $w0[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 15($sp) -; MIPS32R5EB-NEXT: insert.w $w0[3], $1 +; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 +; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: move $fp, $sp +; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EB-NEXT: addiu $1, $zero, -16 +; MIPS32R5EB-NEXT: and $sp, $sp, $1 +; MIPS32R5EB-NEXT: sw $5, 36($sp) +; MIPS32R5EB-NEXT: sw $4, 40($sp) +; MIPS32R5EB-NEXT: lbu $1, 37($sp) +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: lbu $1, 36($sp) +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: lbu $1, 40($sp) +; MIPS32R5EB-NEXT: lbu $2, 41($sp) +; MIPS32R5EB-NEXT: sw $2, 4($sp) +; MIPS32R5EB-NEXT: sw $1, 0($sp) +; MIPS32R5EB-NEXT: ld.w $w0, 16($sp) ; MIPS32R5EB-NEXT: ilvr.w $w0, $w0, $w0 ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 -; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EB-NEXT: ld.w $w1, 0($sp) +; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 +; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0 ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 ; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5EB-NEXT: sb $2, 5($sp) -; MIPS32R5EB-NEXT: sb $1, 4($sp) -; MIPS32R5EB-NEXT: lhu $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $sp, $sp, 16 +; MIPS32R5EB-NEXT: sb $2, 33($sp) +; MIPS32R5EB-NEXT: sb $1, 32($sp) +; MIPS32R5EB-NEXT: lhu $2, 32($sp) +; MIPS32R5EB-NEXT: move $sp, $fp +; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; @@ -179,37 +179,37 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) { ; ; MIPS32R5EL-LABEL: i8_2: ; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -16 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 16 -; MIPS32R5EL-NEXT: sw $5, 8($sp) -; MIPS32R5EL-NEXT: sw $4, 12($sp) -; MIPS32R5EL-NEXT: ldi.b $w0, 0 -; MIPS32R5EL-NEXT: lbu $1, 9($sp) -; MIPS32R5EL-NEXT: lbu $2, 12($sp) -; MIPS32R5EL-NEXT: lbu $3, 8($sp) -; MIPS32R5EL-NEXT: move.v $w1, $w0 -; MIPS32R5EL-NEXT: insert.w $w1[0], $3 -; MIPS32R5EL-NEXT: insert.w $w0[0], $2 -; MIPS32R5EL-NEXT: insert.w $w1[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 10($sp) -; MIPS32R5EL-NEXT: insert.w $w1[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 11($sp) -; MIPS32R5EL-NEXT: insert.w $w1[3], $1 -; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 -; 
MIPS32R5EL-NEXT: lbu $1, 13($sp) -; MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 14($sp) -; MIPS32R5EL-NEXT: insert.w $w0[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 15($sp) -; MIPS32R5EL-NEXT: insert.w $w0[3], $1 +; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 +; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: move $fp, $sp +; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EL-NEXT: addiu $1, $zero, -16 +; MIPS32R5EL-NEXT: and $sp, $sp, $1 +; MIPS32R5EL-NEXT: sw $5, 36($sp) +; MIPS32R5EL-NEXT: sw $4, 40($sp) +; MIPS32R5EL-NEXT: lbu $1, 37($sp) +; MIPS32R5EL-NEXT: sw $1, 20($sp) +; MIPS32R5EL-NEXT: lbu $1, 36($sp) +; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: lbu $1, 41($sp) +; MIPS32R5EL-NEXT: sw $1, 4($sp) +; MIPS32R5EL-NEXT: lbu $1, 40($sp) +; MIPS32R5EL-NEXT: sw $1, 0($sp) +; MIPS32R5EL-NEXT: ld.w $w0, 16($sp) ; MIPS32R5EL-NEXT: ilvr.w $w0, $w0, $w0 -; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EL-NEXT: ld.w $w1, 0($sp) +; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0 ; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2] -; MIPS32R5EL-NEXT: sb $2, 5($sp) -; MIPS32R5EL-NEXT: sb $1, 4($sp) -; MIPS32R5EL-NEXT: lhu $2, 4($sp) -; MIPS32R5EL-NEXT: addiu $sp, $sp, 16 +; MIPS32R5EL-NEXT: sb $2, 33($sp) +; MIPS32R5EL-NEXT: sb $1, 32($sp) +; MIPS32R5EL-NEXT: lhu $2, 32($sp) +; MIPS32R5EL-NEXT: move $sp, $fp +; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop ; @@ -364,102 +364,82 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x ; ; MIPS32R5EB-LABEL: i8x2_7: ; MIPS32R5EB: # %bb.0: # %entry -; MIPS32R5EB-NEXT: addiu $sp, $sp, -24 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 24 -; MIPS32R5EB-NEXT: sw $5, 16($sp) -; MIPS32R5EB-NEXT: sw $4, 20($sp) -; MIPS32R5EB-NEXT: ldi.b $w0, 0 -; MIPS32R5EB-NEXT: lbu $1, 17($sp) -; MIPS32R5EB-NEXT: lbu $2, 16($sp) -; MIPS32R5EB-NEXT: move.v $w1, $w0 -; MIPS32R5EB-NEXT: insert.w $w1[0], $2 -; MIPS32R5EB-NEXT: insert.w $w1[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 18($sp) -; MIPS32R5EB-NEXT: lbu $2, 21($sp) -; MIPS32R5EB-NEXT: lbu $3, 20($sp) -; MIPS32R5EB-NEXT: move.v $w2, $w0 -; MIPS32R5EB-NEXT: insert.w $w2[0], $3 -; MIPS32R5EB-NEXT: insert.w $w2[1], $2 -; MIPS32R5EB-NEXT: insert.w $w1[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 19($sp) -; MIPS32R5EB-NEXT: insert.w $w1[3], $1 -; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 -; MIPS32R5EB-NEXT: lbu $1, 22($sp) -; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 -; MIPS32R5EB-NEXT: insert.w $w2[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 23($sp) -; MIPS32R5EB-NEXT: insert.w $w2[3], $1 -; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177 -; MIPS32R5EB-NEXT: addv.d $w1, $w2, $w1 -; MIPS32R5EB-NEXT: sw $6, 12($sp) -; MIPS32R5EB-NEXT: lbu $1, 13($sp) -; MIPS32R5EB-NEXT: lbu $2, 12($sp) -; MIPS32R5EB-NEXT: move.v $w2, $w0 -; MIPS32R5EB-NEXT: insert.w $w2[0], $2 -; MIPS32R5EB-NEXT: insert.w $w2[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 14($sp) -; MIPS32R5EB-NEXT: insert.w $w2[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 15($sp) -; MIPS32R5EB-NEXT: insert.w $w2[3], $1 -; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177 -; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EB-NEXT: sw $7, 8($sp) -; MIPS32R5EB-NEXT: lbu $1, 9($sp) -; MIPS32R5EB-NEXT: lbu $2, 8($sp) -; MIPS32R5EB-NEXT: move.v $w2, $w0 -; MIPS32R5EB-NEXT: 
insert.w $w2[0], $2 -; MIPS32R5EB-NEXT: insert.w $w2[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 10($sp) -; MIPS32R5EB-NEXT: insert.w $w2[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 11($sp) -; MIPS32R5EB-NEXT: insert.w $w2[3], $1 -; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177 -; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EB-NEXT: lbu $1, 41($sp) -; MIPS32R5EB-NEXT: lbu $2, 40($sp) -; MIPS32R5EB-NEXT: move.v $w2, $w0 -; MIPS32R5EB-NEXT: insert.w $w2[0], $2 -; MIPS32R5EB-NEXT: insert.w $w2[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 42($sp) -; MIPS32R5EB-NEXT: insert.w $w2[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 43($sp) -; MIPS32R5EB-NEXT: insert.w $w2[3], $1 -; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177 -; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EB-NEXT: lbu $1, 45($sp) -; MIPS32R5EB-NEXT: lbu $2, 44($sp) -; MIPS32R5EB-NEXT: move.v $w2, $w0 -; MIPS32R5EB-NEXT: insert.w $w2[0], $2 -; MIPS32R5EB-NEXT: insert.w $w2[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 46($sp) -; MIPS32R5EB-NEXT: insert.w $w2[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 47($sp) -; MIPS32R5EB-NEXT: insert.w $w2[3], $1 -; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177 -; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EB-NEXT: lbu $1, 48($sp) -; MIPS32R5EB-NEXT: insert.w $w0[0], $1 -; MIPS32R5EB-NEXT: lbu $1, 49($sp) -; MIPS32R5EB-NEXT: insert.w $w0[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 50($sp) -; MIPS32R5EB-NEXT: insert.w $w0[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 51($sp) -; MIPS32R5EB-NEXT: insert.w $w0[3], $1 +; MIPS32R5EB-NEXT: addiu $sp, $sp, -144 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 144 +; MIPS32R5EB-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: move $fp, $sp +; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EB-NEXT: addiu $1, $zero, -16 +; MIPS32R5EB-NEXT: and $sp, $sp, $1 +; MIPS32R5EB-NEXT: sw $5, 132($sp) +; MIPS32R5EB-NEXT: sw $4, 136($sp) +; MIPS32R5EB-NEXT: lbu $1, 133($sp) +; MIPS32R5EB-NEXT: sw $1, 68($sp) +; MIPS32R5EB-NEXT: lbu $1, 132($sp) +; MIPS32R5EB-NEXT: sw $1, 64($sp) +; MIPS32R5EB-NEXT: lbu $1, 136($sp) +; MIPS32R5EB-NEXT: lbu $2, 137($sp) +; MIPS32R5EB-NEXT: sw $2, 52($sp) +; MIPS32R5EB-NEXT: sw $1, 48($sp) +; MIPS32R5EB-NEXT: ld.w $w0, 64($sp) ; MIPS32R5EB-NEXT: ilvr.w $w0, $w0, $w0 ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 +; MIPS32R5EB-NEXT: ld.w $w1, 48($sp) +; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 ; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0 +; MIPS32R5EB-NEXT: sw $6, 128($sp) +; MIPS32R5EB-NEXT: lbu $1, 129($sp) +; MIPS32R5EB-NEXT: sw $1, 84($sp) +; MIPS32R5EB-NEXT: lbu $1, 128($sp) +; MIPS32R5EB-NEXT: sw $1, 80($sp) +; MIPS32R5EB-NEXT: ld.w $w1, 80($sp) +; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 +; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EB-NEXT: sw $7, 124($sp) +; MIPS32R5EB-NEXT: lbu $1, 125($sp) +; MIPS32R5EB-NEXT: sw $1, 100($sp) +; MIPS32R5EB-NEXT: lbu $1, 124($sp) +; MIPS32R5EB-NEXT: sw $1, 96($sp) +; MIPS32R5EB-NEXT: ld.w $w1, 96($sp) +; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 +; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EB-NEXT: lbu $1, 161($fp) +; MIPS32R5EB-NEXT: sw $1, 4($sp) +; MIPS32R5EB-NEXT: lbu $1, 160($fp) +; MIPS32R5EB-NEXT: sw $1, 0($sp) +; MIPS32R5EB-NEXT: ld.w $w1, 0($sp) +; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 +; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EB-NEXT: lbu $1, 
165($fp) +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: lbu $1, 164($fp) +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: ld.w $w1, 16($sp) +; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 +; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EB-NEXT: lbu $1, 169($fp) +; MIPS32R5EB-NEXT: sw $1, 36($sp) +; MIPS32R5EB-NEXT: lbu $1, 168($fp) +; MIPS32R5EB-NEXT: sw $1, 32($sp) +; MIPS32R5EB-NEXT: ld.w $w1, 32($sp) +; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177 +; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 ; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5EB-NEXT: sb $2, 5($sp) -; MIPS32R5EB-NEXT: sb $1, 4($sp) -; MIPS32R5EB-NEXT: lhu $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $sp, $sp, 24 +; MIPS32R5EB-NEXT: sb $2, 121($sp) +; MIPS32R5EB-NEXT: sb $1, 120($sp) +; MIPS32R5EB-NEXT: lhu $2, 120($sp) +; MIPS32R5EB-NEXT: move $sp, $fp +; MIPS32R5EB-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 144 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; @@ -720,94 +700,74 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x ; ; MIPS32R5EL-LABEL: i8x2_7: ; MIPS32R5EL: # %bb.0: # %entry -; MIPS32R5EL-NEXT: addiu $sp, $sp, -24 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 24 -; MIPS32R5EL-NEXT: sw $5, 16($sp) -; MIPS32R5EL-NEXT: ldi.b $w0, 0 -; MIPS32R5EL-NEXT: sw $4, 20($sp) -; MIPS32R5EL-NEXT: lbu $1, 17($sp) -; MIPS32R5EL-NEXT: lbu $2, 16($sp) -; MIPS32R5EL-NEXT: move.v $w1, $w0 -; MIPS32R5EL-NEXT: insert.w $w1[0], $2 -; MIPS32R5EL-NEXT: insert.w $w1[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 18($sp) -; MIPS32R5EL-NEXT: insert.w $w1[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 19($sp) -; MIPS32R5EL-NEXT: insert.w $w1[3], $1 -; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 -; MIPS32R5EL-NEXT: lbu $1, 21($sp) -; MIPS32R5EL-NEXT: lbu $2, 20($sp) -; MIPS32R5EL-NEXT: move.v $w2, $w0 -; MIPS32R5EL-NEXT: insert.w $w2[0], $2 -; MIPS32R5EL-NEXT: insert.w $w2[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 22($sp) -; MIPS32R5EL-NEXT: insert.w $w2[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 23($sp) -; MIPS32R5EL-NEXT: insert.w $w2[3], $1 -; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EL-NEXT: addv.d $w1, $w2, $w1 -; MIPS32R5EL-NEXT: sw $6, 12($sp) -; MIPS32R5EL-NEXT: lbu $1, 13($sp) -; MIPS32R5EL-NEXT: lbu $2, 12($sp) -; MIPS32R5EL-NEXT: move.v $w2, $w0 -; MIPS32R5EL-NEXT: insert.w $w2[0], $2 -; MIPS32R5EL-NEXT: insert.w $w2[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 14($sp) -; MIPS32R5EL-NEXT: insert.w $w2[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 15($sp) -; MIPS32R5EL-NEXT: insert.w $w2[3], $1 -; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EL-NEXT: sw $7, 8($sp) -; MIPS32R5EL-NEXT: lbu $1, 9($sp) -; MIPS32R5EL-NEXT: lbu $2, 8($sp) -; MIPS32R5EL-NEXT: move.v $w2, $w0 -; MIPS32R5EL-NEXT: insert.w $w2[0], $2 -; MIPS32R5EL-NEXT: insert.w $w2[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 10($sp) -; MIPS32R5EL-NEXT: insert.w $w2[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 11($sp) -; MIPS32R5EL-NEXT: insert.w $w2[3], $1 -; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EL-NEXT: lbu $1, 41($sp) -; MIPS32R5EL-NEXT: lbu $2, 40($sp) -; MIPS32R5EL-NEXT: move.v $w2, $w0 -; MIPS32R5EL-NEXT: insert.w $w2[0], $2 -; MIPS32R5EL-NEXT: insert.w $w2[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 42($sp) -; MIPS32R5EL-NEXT: insert.w $w2[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 43($sp) -; MIPS32R5EL-NEXT: insert.w $w2[3], $1 -; 
MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EL-NEXT: lbu $1, 45($sp) -; MIPS32R5EL-NEXT: lbu $2, 44($sp) -; MIPS32R5EL-NEXT: move.v $w2, $w0 -; MIPS32R5EL-NEXT: insert.w $w2[0], $2 -; MIPS32R5EL-NEXT: insert.w $w2[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 46($sp) -; MIPS32R5EL-NEXT: insert.w $w2[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 47($sp) -; MIPS32R5EL-NEXT: insert.w $w2[3], $1 -; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2 -; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2 -; MIPS32R5EL-NEXT: lbu $1, 48($sp) -; MIPS32R5EL-NEXT: insert.w $w0[0], $1 -; MIPS32R5EL-NEXT: lbu $1, 49($sp) -; MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 50($sp) -; MIPS32R5EL-NEXT: insert.w $w0[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 51($sp) -; MIPS32R5EL-NEXT: insert.w $w0[3], $1 +; MIPS32R5EL-NEXT: addiu $sp, $sp, -144 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 144 +; MIPS32R5EL-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: move $fp, $sp +; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EL-NEXT: addiu $1, $zero, -16 +; MIPS32R5EL-NEXT: and $sp, $sp, $1 +; MIPS32R5EL-NEXT: sw $5, 132($sp) +; MIPS32R5EL-NEXT: sw $4, 136($sp) +; MIPS32R5EL-NEXT: lbu $1, 133($sp) +; MIPS32R5EL-NEXT: sw $1, 68($sp) +; MIPS32R5EL-NEXT: lbu $1, 132($sp) +; MIPS32R5EL-NEXT: sw $1, 64($sp) +; MIPS32R5EL-NEXT: lbu $1, 137($sp) +; MIPS32R5EL-NEXT: sw $1, 52($sp) +; MIPS32R5EL-NEXT: lbu $1, 136($sp) +; MIPS32R5EL-NEXT: sw $1, 48($sp) +; MIPS32R5EL-NEXT: ld.w $w0, 64($sp) ; MIPS32R5EL-NEXT: ilvr.w $w0, $w0, $w0 +; MIPS32R5EL-NEXT: ld.w $w1, 48($sp) +; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 ; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0 +; MIPS32R5EL-NEXT: sw $6, 128($sp) +; MIPS32R5EL-NEXT: lbu $1, 129($sp) +; MIPS32R5EL-NEXT: sw $1, 84($sp) +; MIPS32R5EL-NEXT: lbu $1, 128($sp) +; MIPS32R5EL-NEXT: sw $1, 80($sp) +; MIPS32R5EL-NEXT: ld.w $w1, 80($sp) +; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EL-NEXT: sw $7, 124($sp) +; MIPS32R5EL-NEXT: lbu $1, 125($sp) +; MIPS32R5EL-NEXT: sw $1, 100($sp) +; MIPS32R5EL-NEXT: lbu $1, 124($sp) +; MIPS32R5EL-NEXT: sw $1, 96($sp) +; MIPS32R5EL-NEXT: ld.w $w1, 96($sp) +; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EL-NEXT: lbu $1, 161($fp) +; MIPS32R5EL-NEXT: sw $1, 4($sp) +; MIPS32R5EL-NEXT: lbu $1, 160($fp) +; MIPS32R5EL-NEXT: sw $1, 0($sp) +; MIPS32R5EL-NEXT: ld.w $w1, 0($sp) +; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EL-NEXT: lbu $1, 165($fp) +; MIPS32R5EL-NEXT: sw $1, 20($sp) +; MIPS32R5EL-NEXT: lbu $1, 164($fp) +; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: ld.w $w1, 16($sp) +; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 +; MIPS32R5EL-NEXT: lbu $1, 169($fp) +; MIPS32R5EL-NEXT: sw $1, 36($sp) +; MIPS32R5EL-NEXT: lbu $1, 168($fp) +; MIPS32R5EL-NEXT: sw $1, 32($sp) +; MIPS32R5EL-NEXT: ld.w $w1, 32($sp) +; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1 +; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 ; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2] -; MIPS32R5EL-NEXT: sb $2, 5($sp) -; MIPS32R5EL-NEXT: sb $1, 4($sp) -; MIPS32R5EL-NEXT: lhu $2, 4($sp) -; MIPS32R5EL-NEXT: addiu $sp, $sp, 24 +; MIPS32R5EL-NEXT: sb $2, 121($sp) +; MIPS32R5EL-NEXT: sb $1, 120($sp) +; MIPS32R5EL-NEXT: lhu $2, 120($sp) +; MIPS32R5EL-NEXT: move $sp, $fp +; MIPS32R5EL-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 144 
; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop ; diff --git a/test/CodeGen/X86/combine-sra.ll b/test/CodeGen/X86/combine-sra.ll index 4fa58e2aae1..82c39377d06 100644 --- a/test/CodeGen/X86/combine-sra.ll +++ b/test/CodeGen/X86/combine-sra.ll @@ -239,10 +239,7 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) { define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) { ; SSE-LABEL: combine_vec_ashr_trunc_ashr: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: psrad $2, %xmm1 diff --git a/test/CodeGen/X86/split-extend-vector-inreg.ll b/test/CodeGen/X86/split-extend-vector-inreg.ll index 84ba12961de..26f3e38cd29 100644 --- a/test/CodeGen/X86/split-extend-vector-inreg.ll +++ b/test/CodeGen/X86/split-extend-vector-inreg.ll @@ -20,10 +20,7 @@ define <4 x i64> @autogen_SD88863() { ; ; X64-LABEL: autogen_SD88863: ; X64: # %bb.0: # %BB -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: movb $1, %al ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB0_1: # %CF @@ -31,6 +28,9 @@ define <4 x i64> @autogen_SD88863() { ; X64-NEXT: testb %al, %al ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # %bb.2: # %CF240 +; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; X64-NEXT: retq BB: %I26 = insertelement <4 x i64> undef, i64 undef, i32 2 diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 649a86dc1fc..17e8e51b8d4 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -917,8 +917,6 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) { ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %ptr = bitcast x86_mmx* %a1 to <2 x float>* @@ -948,8 +946,6 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) { ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: retq diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index 09914e09faa..97cc18e296f 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -379,16 +379,12 @@ entry: define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone { ; X86-LABEL: t16: ; X86: # %bb.0: # %entry -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X86-NEXT: movdqa 
%xmm1, %xmm0 +; X86-NEXT: pslld $16, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: t16: ; X64: # %bb.0: # %entry -; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0] -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: retq entry: %tmp8 = shufflevector <16 x i8> , <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll index 0ccd98983df..73c122ad3e3 100644 --- a/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -511,10 +511,9 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL64-LABEL: expand14: ; KNL64: # %bb.0: -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; @@ -528,10 +527,9 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL32-LABEL: expand14: ; KNL32: # %bb.0: -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> , diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index c1572337bfc..3ae044b0064 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -985,9 +985,8 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1 ; ; X32-AVX512-LABEL: PR34577: ; X32-AVX512: # %bb.0: # %entry -; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0> -; X32-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; X32-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; X32-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = ; X32-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1 @@ -1006,9 +1005,8 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1 ; ; X64-AVX512-LABEL: PR34577: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0> -; X64-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; X64-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = ; X64-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1 diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll index d3597564afd..83ae1e62747 100644 --- a/test/CodeGen/X86/vector-shuffle-sse1.ll +++ b/test/CodeGen/X86/vector-shuffle-sse1.ll @@ -237,8 +237,6 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) { ; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE1-NEXT: movss {{.*#+}} 
xmm2 = mem[0],zero,zero,zero ; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE1-NEXT: xorps %xmm2, %xmm2 -; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE1-NEXT: movaps %xmm1, %xmm0 ; SSE1-NEXT: retq @@ -258,8 +256,6 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) { ; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE1-NEXT: xorps %xmm2, %xmm2 -; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE1-NEXT: retq %a = load <2 x float>, <2 x float>* %ptr diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index 901d83c75df..f2efa878c61 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -710,7 +710,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index 9171805ac54..6f6cc51fd5f 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -54,61 +54,19 @@ entry: } define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) { -; SSE2-LABEL: trunc8i64_8i32_ashr: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2] -; SSE2-NEXT: movaps %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i64_8i32_ashr: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2] -; SSSE3-NEXT: movaps %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i64_8i32_ashr: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7] -; SSE41-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[0,2] -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: trunc8i64_8i32_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc8i64_8i32_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ;