
[SystemZ] Handle sub-128 vectors

The ABI allows sub-128 vectors to be passed and returned in registers,
with the vector occupying the upper part of a register.  We therefore
want to legalize those types by widening the vector rather than promoting
the elements.
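
As a rough illustration (hypothetical function and value names, in the
spirit of the tests added below), a sub-128 vector argument now stays in
a vector register, with its payload occupying the leftmost bytes of the
widened type:

; Sketch only: with widening, %val1 and %val2 are legalized to v16i8
; values whose first four bytes carry the <4 x i8> payload, so the
; operation can use the full-width vector instruction (a single vsb on
; z13, as in the subtraction tests below).
define <4 x i8> @sub_sketch(<4 x i8> %val1, <4 x i8> %val2) {
  %ret = sub <4 x i8> %val1, %val2
  ret <4 x i8> %ret
}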

The patch includes some simple tests for sub-128 vectors and also tests
that we can recognize various pack sequences, some of which use sub-128
vectors as temporary results.  One of these forms is based on the pack
sequences generated by llvmpipe when no intrinsics are used.
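
The most direct of these forms writes the pack as a single shuffle that
keeps the low (odd-numbered) halves of the wider elements.  Roughly
(hypothetical function name, mirroring the f3 test added below):

; Sketch only: a <4 x i32> -> <8 x i16> pack written as a shuffle of the
; odd i16 halves; this should select to a single vpkf.
define <8 x i16> @pack_sketch(<4 x i32> %val0, <4 x i32> %val1) {
  %lo0 = bitcast <4 x i32> %val0 to <8 x i16>
  %lo1 = bitcast <4 x i32> %val1 to <8 x i16>
  %ret = shufflevector <8 x i16> %lo0, <8 x i16> %lo1,
                       <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                  i32 9, i32 11, i32 13, i32 15>
  ret <8 x i16> %ret
}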

Signed unpacks are recognized as BUILD_VECTORs whose elements are
individually sign-extended.  Unsigned unpacks can have the equivalent
form with zero extension, but they also occur as shuffles in which some
elements are zero.
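
As a rough sketch (hypothetical function names; the element counts are
chosen to line up with the extend tests added below), the first form
builds the result from individually extended scalars, while the second
interleaves zero bytes ahead of the data bytes:

; Sketch only: sign extension written element by element.  After
; legalization this is a BUILD_VECTOR of sign-extended values and should
; become a single vuphf.
define <2 x i64> @sext_form(<4 x i32> %val) {
  %elt0 = extractelement <4 x i32> %val, i32 0
  %elt1 = extractelement <4 x i32> %val, i32 1
  %ext0 = sext i32 %elt0 to i64
  %ext1 = sext i32 %elt1 to i64
  %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0
  %ret = insertelement <2 x i64> %vec0, i64 %ext1, i32 1
  ret <2 x i64> %ret
}

; Sketch only: zero extension written as a shuffle with a zero vector.
; Each data byte is preceded by a zero byte (big-endian), which should
; become a single vuplhb.
define <8 x i16> @zext_shuffle_form(<16 x i8> %val) {
  %mix = shufflevector <16 x i8> %val, <16 x i8> zeroinitializer,
                       <16 x i32> <i32 16, i32 0, i32 16, i32 1,
                                   i32 16, i32 2, i32 16, i32 3,
                                   i32 16, i32 4, i32 16, i32 5,
                                   i32 16, i32 6, i32 16, i32 7>
  %ret = bitcast <16 x i8> %mix to <8 x i16>
  ret <8 x i16> %ret
}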

Based on a patch by Richard Sandiford.

llvm-svn: 236525
Ulrich Weigand 2015-05-05 19:29:21 +00:00
parent 73b770d782
commit 096deccae0
20 changed files with 1175 additions and 29 deletions


@ -28,6 +28,14 @@ private:
/// See ISD::OutputArg::IsFixed.
SmallVector<bool, 4> ArgIsFixed;
/// Records whether the value was widened from a short vector type.
SmallVector<bool, 4> ArgIsShortVector;
// Check whether ArgVT is a short vector type.
bool IsShortVectorType(EVT ArgVT) {
return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
}
public:
SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
@ -39,6 +47,10 @@ public:
ArgIsFixed.clear();
for (unsigned i = 0; i < Ins.size(); ++i)
ArgIsFixed.push_back(true);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Ins.size(); ++i)
ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
CCState::AnalyzeFormalArguments(Ins, Fn);
}
@ -49,6 +61,10 @@ public:
ArgIsFixed.clear();
for (unsigned i = 0; i < Outs.size(); ++i)
ArgIsFixed.push_back(Outs[i].IsFixed);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Outs.size(); ++i)
ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
CCState::AnalyzeCallOperands(Outs, Fn);
}
@ -60,6 +76,7 @@ public:
CCAssignFn Fn) = delete;
bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
};
} // end namespace llvm


@ -21,6 +21,11 @@ class CCIfSubtarget<string F, CCAction A>
class CCIfFixed<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
// Match if this specific argument was widened from a short vector type.
class CCIfShortVector<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
//===----------------------------------------------------------------------===//
// z/Linux return value calling convention
//===----------------------------------------------------------------------===//
@ -43,6 +48,8 @@ def RetCC_SystemZ : CallingConv<[
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// Similarly for vectors, with V24 being the ABI-compliant choice.
// Sub-128 vectors are returned in the same way, but they're widened
// to one of these types during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
@ -74,12 +81,20 @@ def CC_SystemZ : CallingConv<[
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
// The first 8 named vector arguments are passed in V24-V31.
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
// are passed in the same way, but they're widened to one of these types
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
V25, V27, V29, V31]>>>>,
// However, sub-128 vectors which need to go on the stack occupy just a
// single 8-byte-aligned 8-byte stack slot. Pass as i64.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfShortVector<CCBitConvertToType<i64>>>>,
// Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],


@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
// Convert a GPR scalar to a vector by inserting it into element 0.
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
// Use a series of unpacks for extensions.
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
// Detect shifts by a scalar amount and convert them into
// V*_BY_SCALAR.
setOperationAction(ISD::SHL, VT, Custom);
@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
else if (VA.getLocInfo() == CCValAssign::Indirect)
Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
MachinePointerInfo(), false, false, false, 0);
else
else if (VA.getLocInfo() == CCValAssign::BCvt) {
// If this is a short vector argument loaded from the stack,
// extend from i64 to full vector size and then bitcast.
assert(VA.getLocVT() == MVT::i64);
assert(VA.getValVT().isVector());
Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64,
Value, DAG.getUNDEF(MVT::i64));
Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
} else
assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
return Value;
}
@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
case CCValAssign::AExt:
return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
case CCValAssign::BCvt:
// If this is a short vector argument to be stored to the stack,
// bitcast to v2i64 and then extract first element.
assert(VA.getLocVT() == MVT::i64);
assert(VA.getValVT().isVector());
Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
DAG.getConstant(0, DL, MVT::i32));
case CCValAssign::Full:
return Value;
default:
@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
SDValue
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
unsigned UnpackHigh) const {
SDValue PackedOp = Op.getOperand(0);
EVT OutVT = Op.getValueType();
EVT InVT = PackedOp.getValueType();
unsigned ToBits = OutVT.getVectorElementType().getSizeInBits();
unsigned FromBits = InVT.getVectorElementType().getSizeInBits();
do {
FromBits *= 2;
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
SystemZ::VectorBits / FromBits);
PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
} while (FromBits != ToBits);
return PackedOp;
}
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
unsigned ByScalar) const {
// Look for cases where a vector shift can use the *_BY_SCALAR form.
@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::SIGN_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
case ISD::ZERO_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
case ISD::SHL:
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
case ISD::SRL:
@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(PERMUTE_DWORDS);
OPCODE(PERMUTE);
OPCODE(PACK);
OPCODE(UNPACK_HIGH);
OPCODE(UNPACKL_HIGH);
OPCODE(UNPACK_LOW);
OPCODE(UNPACKL_LOW);
OPCODE(VSHL_BY_SCALAR);
OPCODE(VSRL_BY_SCALAR);
OPCODE(VSRA_BY_SCALAR);
@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
// (z_merge_high 0, 0) -> 0. This is mostly useful for using VLLEZF
// for v4f32.
if (Opcode == SystemZISD::MERGE_HIGH) {
if (Opcode == SystemZISD::MERGE_HIGH ||
Opcode == SystemZISD::MERGE_LOW) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0 == Op1) {
if (Op0.getOpcode() == ISD::BITCAST)
Op0 = Op0.getOperand(0);
if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
// (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
// for v4f32.
if (Op1 == N->getOperand(0))
return Op1;
// (z_merge_? 0, X) -> (z_unpackl_? 0, X).
EVT VT = Op1.getValueType();
unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
if (ElemBytes <= 4) {
Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
EVT InVT = VT.changeVectorElementTypeToInteger();
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
SystemZ::VectorBytes / ElemBytes / 2);
if (VT != InVT) {
Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
DCI.AddToWorklist(Op1.getNode());
}
SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
DCI.AddToWorklist(Op.getNode());
return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
}
}
// If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better


@ -201,6 +201,15 @@ enum {
// Pack vector operands 0 and 1 into a single vector with half-sized elements.
PACK,
// Unpack the first half of vector operand 0 into double-sized elements.
// UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
UNPACK_HIGH,
UNPACKL_HIGH,
// Likewise for the second half.
UNPACK_LOW,
UNPACKL_LOW,
// Shift each element of vector operand 0 by the number of bits specified
// by scalar operand 1.
VSHL_BY_SCALAR,
@ -306,6 +315,23 @@ public:
// want to clobber the upper 32 bits of a GPR unnecessarily.
return MVT::i32;
}
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
const override {
// Widen subvectors to the full width rather than promoting integer
// elements. This is better because:
//
// (a) it means that we can handle the ABI for passing and returning
// sub-128 vectors without having to handle them as legal types.
//
// (b) we don't have instructions to extend on load and truncate on store,
// so promoting the integers is less efficient.
//
// (c) there are no multiplication instructions for the widest integer
// type (v2i64).
if (VT.getVectorElementType().getSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
EVT getSetCCResultType(LLVMContext &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@ -417,6 +443,8 @@ private:
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
unsigned UnpackHigh) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,


@ -290,24 +290,24 @@ let Predicates = [FeatureVector] in {
def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;
// Unpack high.
def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>;
def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, null_frag, v128f, v128h, 1>;
def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, null_frag, v128g, v128f, 2>;
def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>;
def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>;
def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>;
// Unpack logical high.
def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, null_frag, v128h, v128b, 0>;
def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, null_frag, v128f, v128h, 1>;
def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, null_frag, v128g, v128f, 2>;
def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>;
def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>;
def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>;
// Unpack low.
def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, null_frag, v128h, v128b, 0>;
def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, null_frag, v128f, v128h, 1>;
def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, null_frag, v128g, v128f, 2>;
def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>;
def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>;
def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>;
// Unpack logical low.
def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, null_frag, v128h, v128b, 0>;
def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, null_frag, v128f, v128h, 1>;
def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>;
def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>;
def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>;
def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>;
}
//===----------------------------------------------------------------------===//


@ -193,6 +193,10 @@ def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
SDT_ZVecTernaryInt>;
def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>;
def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR",
SDT_ZVecBinaryInt>;
def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR",
@ -544,11 +548,12 @@ def z_vllezi64 : PatFrag<(ops node:$addr),
def z_vllezf32 : PatFrag<(ops node:$addr),
(bitconvert
(z_merge_high
(v2i64 (bitconvert
(z_merge_high
(v4f32 (z_vzero)),
(v2i64
(z_unpackl_high
(v4i32
(bitconvert
(v4f32 (scalar_to_vector
(f32 (load node:$addr))))))),
(f32 (load node:$addr)))))))),
(v2i64 (z_vzero))))>;
def z_vllezf64 : PatFrag<(ops node:$addr),
(z_merge_high


@ -14,3 +14,17 @@ define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4
%y = sub <4 x i32> %v2, %v10
ret <4 x i32> %y
}
; This routine has 10 vector arguments, which fill up %v24-%v31 and
; the two single-wide stack slots at 160 and 168.
define <4 x i8> @bar(<4 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3, <4 x i8> %v4,
<4 x i8> %v5, <4 x i8> %v6, <4 x i8> %v7, <4 x i8> %v8,
<4 x i8> %v9, <4 x i8> %v10) {
; CHECK-LABEL: bar:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 168(%r15)
; CHECK: vsb %v24, %v26, [[REG1]]
; CHECK: br %r14
%y = sub <4 x i8> %v2, %v10
ret <4 x i8> %y
}


@ -0,0 +1,50 @@
; Test the handling of named short vector arguments.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
; This routine has 12 vector arguments, which fill up %v24-%v31
; and the four single-wide stack slots starting at 160.
declare void @bar(<1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>,
<1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>,
<1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>)
define void @foo() {
; CHECK-VEC-LABEL: foo:
; CHECK-VEC-DAG: vrepib %v24, 1
; CHECK-VEC-DAG: vrepib %v26, 2
; CHECK-VEC-DAG: vrepib %v28, 3
; CHECK-VEC-DAG: vrepib %v30, 4
; CHECK-VEC-DAG: vrepib %v25, 5
; CHECK-VEC-DAG: vrepib %v27, 6
; CHECK-VEC-DAG: vrepib %v29, 7
; CHECK-VEC-DAG: vrepib %v31, 8
; CHECK-VEC: brasl %r14, bar@PLT
;
; CHECK-STACK-LABEL: foo:
; CHECK-STACK: aghi %r15, -192
; CHECK-STACK-DAG: llihh [[REG1:%r[0-9]+]], 2304
; CHECK-STACK-DAG: stg [[REG1]], 160(%r15)
; CHECK-STACK-DAG: llihh [[REG2:%r[0-9]+]], 2570
; CHECK-STACK-DAG: stg [[REG2]], 168(%r15)
; CHECK-STACK-DAG: llihf [[REG3:%r[0-9]+]], 185273099
; CHECK-STACK-DAG: stg [[REG3]], 176(%r15)
; CHECK-STACK-DAG: llihf [[REG4:%r[0-9]+]], 202116108
; CHECK-STACK-DAG: oilf [[REG4]], 202116108
; CHECK-STACK-DAG: stg [[REG4]], 184(%r15)
; CHECK-STACK: brasl %r14, bar@PLT
call void @bar (<1 x i8> <i8 1>,
<2 x i8> <i8 2, i8 2>,
<4 x i8> <i8 3, i8 3, i8 3, i8 3>,
<8 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>,
<1 x i8> <i8 5>,
<2 x i8> <i8 6, i8 6>,
<4 x i8> <i8 7, i8 7, i8 7, i8 7>,
<8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>,
<1 x i8> <i8 9>,
<2 x i8> <i8 10, i8 10>,
<4 x i8> <i8 11, i8 11, i8 11, i8 11>,
<8 x i8> <i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12>)
ret void
}


@ -0,0 +1,32 @@
; Test the handling of unnamed short vector arguments.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
; This routine is called with two named vector arguments (passed
; in %v24 and %v26) and two unnamed vector arguments (passed
; in the single-wide stack slots at 160 and 168).
declare void @bar(<4 x i8>, <4 x i8>, ...)
define void @foo() {
; CHECK-VEC-LABEL: foo:
; CHECK-VEC-DAG: vrepib %v24, 1
; CHECK-VEC-DAG: vrepib %v26, 2
; CHECK-VEC: brasl %r14, bar@PLT
;
; CHECK-STACK-LABEL: foo:
; CHECK-STACK: aghi %r15, -176
; CHECK-STACK-DAG: llihf [[REG1:%r[0-9]+]], 50529027
; CHECK-STACK-DAG: stg [[REG1]], 160(%r15)
; CHECK-STACK-DAG: llihf [[REG2:%r[0-9]+]], 67372036
; CHECK-STACK-DAG: stg [[REG2]], 168(%r15)
; CHECK-STACK: brasl %r14, bar@PLT
call void (<4 x i8>, <4 x i8>, ...) @bar
(<4 x i8> <i8 1, i8 1, i8 1, i8 1>,
<4 x i8> <i8 2, i8 2, i8 2, i8 2>,
<4 x i8> <i8 3, i8 3, i8 3, i8 3>,
<4 x i8> <i8 4, i8 4, i8 4, i8 4>)
ret void
}


@ -105,3 +105,51 @@ define i16 @f5(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) {
%res = add i16 %elem1, %elem2
ret i16 %res
}
; Test a case where an unpack high can be eliminated from the usual
; load-extend sequence.
define void @f6(<8 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
; CHECK-LABEL: f6:
; CHECK: vlrepg [[REG:%v[0-9]+]], 0(%r2)
; CHECK-NOT: vup
; CHECK-DAG: vsteb [[REG]], 0(%r3), 1
; CHECK-DAG: vsteb [[REG]], 0(%r4), 2
; CHECK-DAG: vsteb [[REG]], 0(%r5), 7
; CHECK: br %r14
%vec = load <8 x i8>, <8 x i8> *%ptr1
%ext = sext <8 x i8> %vec to <8 x i16>
%elem1 = extractelement <8 x i16> %ext, i32 1
%elem2 = extractelement <8 x i16> %ext, i32 2
%elem3 = extractelement <8 x i16> %ext, i32 7
%trunc1 = trunc i16 %elem1 to i8
%trunc2 = trunc i16 %elem2 to i8
%trunc3 = trunc i16 %elem3 to i8
store i8 %trunc1, i8 *%ptr2
store i8 %trunc2, i8 *%ptr3
store i8 %trunc3, i8 *%ptr4
ret void
}
; ...and again with a bitcast in between.
define void @f7(<4 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
; CHECK-LABEL: f7:
; CHECK: vlrepf [[REG:%v[0-9]+]], 0(%r2)
; CHECK-NOT: vup
; CHECK-DAG: vsteb [[REG]], 0(%r3), 0
; CHECK-DAG: vsteb [[REG]], 0(%r4), 1
; CHECK-DAG: vsteb [[REG]], 0(%r5), 3
; CHECK: br %r14
%vec = load <4 x i8>, <4 x i8> *%ptr1
%ext = sext <4 x i8> %vec to <4 x i32>
%bitcast = bitcast <4 x i32> %ext to <8 x i16>
%elem1 = extractelement <8 x i16> %bitcast, i32 1
%elem2 = extractelement <8 x i16> %bitcast, i32 3
%elem3 = extractelement <8 x i16> %bitcast, i32 7
%trunc1 = trunc i16 %elem1 to i8
%trunc2 = trunc i16 %elem2 to i8
%trunc3 = trunc i16 %elem3 to i8
store i8 %trunc1, i8 *%ptr2
store i8 %trunc2, i8 *%ptr3
store i8 %trunc3, i8 *%ptr4
ret void
}


@ -0,0 +1,433 @@
; Test various representations of pack-like operations.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; One way of writing a <4 x i32> -> <8 x i16> pack.
define <8 x i16> @f1(<4 x i32> %val0, <4 x i32> %val1) {
; CHECK-LABEL: f1:
; CHECK: vpkf %v24, %v24, %v26
; CHECK: br %r14
%elem0 = extractelement <4 x i32> %val0, i32 0
%elem1 = extractelement <4 x i32> %val0, i32 1
%elem2 = extractelement <4 x i32> %val0, i32 2
%elem3 = extractelement <4 x i32> %val0, i32 3
%elem4 = extractelement <4 x i32> %val1, i32 0
%elem5 = extractelement <4 x i32> %val1, i32 1
%elem6 = extractelement <4 x i32> %val1, i32 2
%elem7 = extractelement <4 x i32> %val1, i32 3
%hboth0 = bitcast i32 %elem0 to <2 x i16>
%hboth1 = bitcast i32 %elem1 to <2 x i16>
%hboth2 = bitcast i32 %elem2 to <2 x i16>
%hboth3 = bitcast i32 %elem3 to <2 x i16>
%hboth4 = bitcast i32 %elem4 to <2 x i16>
%hboth5 = bitcast i32 %elem5 to <2 x i16>
%hboth6 = bitcast i32 %elem6 to <2 x i16>
%hboth7 = bitcast i32 %elem7 to <2 x i16>
%hlow0 = shufflevector <2 x i16> %hboth0, <2 x i16> %hboth1,
<2 x i32> <i32 1, i32 3>
%hlow1 = shufflevector <2 x i16> %hboth2, <2 x i16> %hboth3,
<2 x i32> <i32 1, i32 3>
%hlow2 = shufflevector <2 x i16> %hboth4, <2 x i16> %hboth5,
<2 x i32> <i32 1, i32 3>
%hlow3 = shufflevector <2 x i16> %hboth6, <2 x i16> %hboth7,
<2 x i32> <i32 1, i32 3>
%join0 = shufflevector <2 x i16> %hlow0, <2 x i16> %hlow1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%join1 = shufflevector <2 x i16> %hlow2, <2 x i16> %hlow3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%ret = shufflevector <4 x i16> %join0, <4 x i16> %join1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %ret
}
; A different way of writing a <4 x i32> -> <8 x i16> pack.
define <8 x i16> @f2(<4 x i32> %val0, <4 x i32> %val1) {
; CHECK-LABEL: f2:
; CHECK: vpkf %v24, %v24, %v26
; CHECK: br %r14
%elem0 = extractelement <4 x i32> %val0, i32 0
%elem1 = extractelement <4 x i32> %val0, i32 1
%elem2 = extractelement <4 x i32> %val0, i32 2
%elem3 = extractelement <4 x i32> %val0, i32 3
%elem4 = extractelement <4 x i32> %val1, i32 0
%elem5 = extractelement <4 x i32> %val1, i32 1
%elem6 = extractelement <4 x i32> %val1, i32 2
%elem7 = extractelement <4 x i32> %val1, i32 3
%wvec0 = insertelement <4 x i32> undef, i32 %elem0, i32 0
%wvec1 = insertelement <4 x i32> undef, i32 %elem1, i32 0
%wvec2 = insertelement <4 x i32> undef, i32 %elem2, i32 0
%wvec3 = insertelement <4 x i32> undef, i32 %elem3, i32 0
%wvec4 = insertelement <4 x i32> undef, i32 %elem4, i32 0
%wvec5 = insertelement <4 x i32> undef, i32 %elem5, i32 0
%wvec6 = insertelement <4 x i32> undef, i32 %elem6, i32 0
%wvec7 = insertelement <4 x i32> undef, i32 %elem7, i32 0
%hvec0 = bitcast <4 x i32> %wvec0 to <8 x i16>
%hvec1 = bitcast <4 x i32> %wvec1 to <8 x i16>
%hvec2 = bitcast <4 x i32> %wvec2 to <8 x i16>
%hvec3 = bitcast <4 x i32> %wvec3 to <8 x i16>
%hvec4 = bitcast <4 x i32> %wvec4 to <8 x i16>
%hvec5 = bitcast <4 x i32> %wvec5 to <8 x i16>
%hvec6 = bitcast <4 x i32> %wvec6 to <8 x i16>
%hvec7 = bitcast <4 x i32> %wvec7 to <8 x i16>
%hlow0 = shufflevector <8 x i16> %hvec0, <8 x i16> %hvec1,
<8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%hlow1 = shufflevector <8 x i16> %hvec2, <8 x i16> %hvec3,
<8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%hlow2 = shufflevector <8 x i16> %hvec4, <8 x i16> %hvec5,
<8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%hlow3 = shufflevector <8 x i16> %hvec6, <8 x i16> %hvec7,
<8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%join0 = shufflevector <8 x i16> %hlow0, <8 x i16> %hlow1,
<8 x i32> <i32 0, i32 1, i32 8, i32 9,
i32 undef, i32 undef, i32 undef, i32 undef>
%join1 = shufflevector <8 x i16> %hlow2, <8 x i16> %hlow3,
<8 x i32> <i32 0, i32 1, i32 8, i32 9,
i32 undef, i32 undef, i32 undef, i32 undef>
%ret = shufflevector <8 x i16> %join0, <8 x i16> %join1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 8, i32 9, i32 10, i32 11>
ret <8 x i16> %ret
}
; A direct pack operation.
define <8 x i16> @f3(<4 x i32> %val0, <4 x i32> %val1) {
; CHECK-LABEL: f3:
; CHECK: vpkf %v24, %v24, %v26
; CHECK: br %r14
%bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
%bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
%ret = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
<8 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15>
ret <8 x i16> %ret
}
; One way of writing a <4 x i32> -> <16 x i8> pack. It doesn't matter
; whether the first pack is VPKF or VPKH since the even bytes of the
; result are discarded.
define <16 x i8> @f4(<4 x i32> %val0, <4 x i32> %val1,
<4 x i32> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f4:
; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26
; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30
; CHECK: vpkh %v24, [[REG1]], [[REG2]]
; CHECK: br %r14
%bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
%bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
%bitcast2 = bitcast <4 x i32> %val2 to <8 x i16>
%bitcast3 = bitcast <4 x i32> %val3 to <8 x i16>
%join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
<8 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15>
%join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3,
<8 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15>
%bitcast4 = bitcast <8 x i16> %join0 to <16 x i8>
%bitcast5 = bitcast <8 x i16> %join1 to <16 x i8>
%ret = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5,
<16 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15,
i32 17, i32 19, i32 21, i32 23,
i32 25, i32 27, i32 29, i32 31>
ret <16 x i8> %ret
}
; Check the same operation, but with elements being extracted from the result.
define void @f5(<4 x i32> %val0, <4 x i32> %val1,
<4 x i32> %val2, <4 x i32> %val3,
i8 *%base) {
; CHECK-LABEL: f5:
; CHECK-DAG: vsteb %v24, 0(%r2), 11
; CHECK-DAG: vsteb %v26, 1(%r2), 15
; CHECK-DAG: vsteb %v28, 2(%r2), 3
; CHECK-DAG: vsteb %v30, 3(%r2), 7
; CHECK: br %r14
%bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
%bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
%bitcast2 = bitcast <4 x i32> %val2 to <8 x i16>
%bitcast3 = bitcast <4 x i32> %val3 to <8 x i16>
%join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
<8 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15>
%join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3,
<8 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15>
%bitcast4 = bitcast <8 x i16> %join0 to <16 x i8>
%bitcast5 = bitcast <8 x i16> %join1 to <16 x i8>
%vec = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5,
<16 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15,
i32 17, i32 19, i32 21, i32 23,
i32 25, i32 27, i32 29, i32 31>
%ptr0 = getelementptr i8, i8 *%base, i64 0
%ptr1 = getelementptr i8, i8 *%base, i64 1
%ptr2 = getelementptr i8, i8 *%base, i64 2
%ptr3 = getelementptr i8, i8 *%base, i64 3
%byte0 = extractelement <16 x i8> %vec, i32 2
%byte1 = extractelement <16 x i8> %vec, i32 7
%byte2 = extractelement <16 x i8> %vec, i32 8
%byte3 = extractelement <16 x i8> %vec, i32 13
store i8 %byte0, i8 *%ptr0
store i8 %byte1, i8 *%ptr1
store i8 %byte2, i8 *%ptr2
store i8 %byte3, i8 *%ptr3
ret void
}
; A different way of writing a <4 x i32> -> <16 x i8> pack.
define <16 x i8> @f6(<4 x i32> %val0, <4 x i32> %val1,
<4 x i32> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f6:
; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26
; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30
; CHECK: vpkh %v24, [[REG1]], [[REG2]]
; CHECK: br %r14
%elem0 = extractelement <4 x i32> %val0, i32 0
%elem1 = extractelement <4 x i32> %val0, i32 1
%elem2 = extractelement <4 x i32> %val0, i32 2
%elem3 = extractelement <4 x i32> %val0, i32 3
%elem4 = extractelement <4 x i32> %val1, i32 0
%elem5 = extractelement <4 x i32> %val1, i32 1
%elem6 = extractelement <4 x i32> %val1, i32 2
%elem7 = extractelement <4 x i32> %val1, i32 3
%elem8 = extractelement <4 x i32> %val2, i32 0
%elem9 = extractelement <4 x i32> %val2, i32 1
%elem10 = extractelement <4 x i32> %val2, i32 2
%elem11 = extractelement <4 x i32> %val2, i32 3
%elem12 = extractelement <4 x i32> %val3, i32 0
%elem13 = extractelement <4 x i32> %val3, i32 1
%elem14 = extractelement <4 x i32> %val3, i32 2
%elem15 = extractelement <4 x i32> %val3, i32 3
%bitcast0 = bitcast i32 %elem0 to <2 x i16>
%bitcast1 = bitcast i32 %elem1 to <2 x i16>
%bitcast2 = bitcast i32 %elem2 to <2 x i16>
%bitcast3 = bitcast i32 %elem3 to <2 x i16>
%bitcast4 = bitcast i32 %elem4 to <2 x i16>
%bitcast5 = bitcast i32 %elem5 to <2 x i16>
%bitcast6 = bitcast i32 %elem6 to <2 x i16>
%bitcast7 = bitcast i32 %elem7 to <2 x i16>
%bitcast8 = bitcast i32 %elem8 to <2 x i16>
%bitcast9 = bitcast i32 %elem9 to <2 x i16>
%bitcast10 = bitcast i32 %elem10 to <2 x i16>
%bitcast11 = bitcast i32 %elem11 to <2 x i16>
%bitcast12 = bitcast i32 %elem12 to <2 x i16>
%bitcast13 = bitcast i32 %elem13 to <2 x i16>
%bitcast14 = bitcast i32 %elem14 to <2 x i16>
%bitcast15 = bitcast i32 %elem15 to <2 x i16>
%low0 = shufflevector <2 x i16> %bitcast0, <2 x i16> %bitcast1,
<2 x i32> <i32 1, i32 3>
%low1 = shufflevector <2 x i16> %bitcast2, <2 x i16> %bitcast3,
<2 x i32> <i32 1, i32 3>
%low2 = shufflevector <2 x i16> %bitcast4, <2 x i16> %bitcast5,
<2 x i32> <i32 1, i32 3>
%low3 = shufflevector <2 x i16> %bitcast6, <2 x i16> %bitcast7,
<2 x i32> <i32 1, i32 3>
%low4 = shufflevector <2 x i16> %bitcast8, <2 x i16> %bitcast9,
<2 x i32> <i32 1, i32 3>
%low5 = shufflevector <2 x i16> %bitcast10, <2 x i16> %bitcast11,
<2 x i32> <i32 1, i32 3>
%low6 = shufflevector <2 x i16> %bitcast12, <2 x i16> %bitcast13,
<2 x i32> <i32 1, i32 3>
%low7 = shufflevector <2 x i16> %bitcast14, <2 x i16> %bitcast15,
<2 x i32> <i32 1, i32 3>
%bytes0 = bitcast <2 x i16> %low0 to <4 x i8>
%bytes1 = bitcast <2 x i16> %low1 to <4 x i8>
%bytes2 = bitcast <2 x i16> %low2 to <4 x i8>
%bytes3 = bitcast <2 x i16> %low3 to <4 x i8>
%bytes4 = bitcast <2 x i16> %low4 to <4 x i8>
%bytes5 = bitcast <2 x i16> %low5 to <4 x i8>
%bytes6 = bitcast <2 x i16> %low6 to <4 x i8>
%bytes7 = bitcast <2 x i16> %low7 to <4 x i8>
%blow0 = shufflevector <4 x i8> %bytes0, <4 x i8> %bytes1,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%blow1 = shufflevector <4 x i8> %bytes2, <4 x i8> %bytes3,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%blow2 = shufflevector <4 x i8> %bytes4, <4 x i8> %bytes5,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%blow3 = shufflevector <4 x i8> %bytes6, <4 x i8> %bytes7,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%join0 = shufflevector <4 x i8> %blow0, <4 x i8> %blow1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
%join1 = shufflevector <4 x i8> %blow2, <4 x i8> %blow3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
%ret = shufflevector <8 x i8> %join0, <8 x i8> %join1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11,
i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %ret
}
; One way of writing a <2 x i64> -> <16 x i8> pack.
define <16 x i8> @f7(<2 x i64> %val0, <2 x i64> %val1,
<2 x i64> %val2, <2 x i64> %val3,
<2 x i64> %val4, <2 x i64> %val5,
<2 x i64> %val6, <2 x i64> %val7) {
; CHECK-LABEL: f7:
; CHECK-DAG: vpk{{[hfg]}} [[REG1:%v[0-9]+]], %v24, %v26
; CHECK-DAG: vpk{{[hfg]}} [[REG2:%v[0-9]+]], %v28, %v30
; CHECK-DAG: vpk{{[hfg]}} [[REG3:%v[0-9]+]], %v25, %v27
; CHECK-DAG: vpk{{[hfg]}} [[REG4:%v[0-9]+]], %v29, %v31
; CHECK-DAG: vpk{{[hf]}} [[REG5:%v[0-9]+]], [[REG1]], [[REG2]]
; CHECK-DAG: vpk{{[hf]}} [[REG6:%v[0-9]+]], [[REG3]], [[REG4]]
; CHECK: vpkh %v24, [[REG5]], [[REG6]]
; CHECK: br %r14
%elem0 = extractelement <2 x i64> %val0, i32 0
%elem1 = extractelement <2 x i64> %val0, i32 1
%elem2 = extractelement <2 x i64> %val1, i32 0
%elem3 = extractelement <2 x i64> %val1, i32 1
%elem4 = extractelement <2 x i64> %val2, i32 0
%elem5 = extractelement <2 x i64> %val2, i32 1
%elem6 = extractelement <2 x i64> %val3, i32 0
%elem7 = extractelement <2 x i64> %val3, i32 1
%elem8 = extractelement <2 x i64> %val4, i32 0
%elem9 = extractelement <2 x i64> %val4, i32 1
%elem10 = extractelement <2 x i64> %val5, i32 0
%elem11 = extractelement <2 x i64> %val5, i32 1
%elem12 = extractelement <2 x i64> %val6, i32 0
%elem13 = extractelement <2 x i64> %val6, i32 1
%elem14 = extractelement <2 x i64> %val7, i32 0
%elem15 = extractelement <2 x i64> %val7, i32 1
%bitcast0 = bitcast i64 %elem0 to <2 x i32>
%bitcast1 = bitcast i64 %elem1 to <2 x i32>
%bitcast2 = bitcast i64 %elem2 to <2 x i32>
%bitcast3 = bitcast i64 %elem3 to <2 x i32>
%bitcast4 = bitcast i64 %elem4 to <2 x i32>
%bitcast5 = bitcast i64 %elem5 to <2 x i32>
%bitcast6 = bitcast i64 %elem6 to <2 x i32>
%bitcast7 = bitcast i64 %elem7 to <2 x i32>
%bitcast8 = bitcast i64 %elem8 to <2 x i32>
%bitcast9 = bitcast i64 %elem9 to <2 x i32>
%bitcast10 = bitcast i64 %elem10 to <2 x i32>
%bitcast11 = bitcast i64 %elem11 to <2 x i32>
%bitcast12 = bitcast i64 %elem12 to <2 x i32>
%bitcast13 = bitcast i64 %elem13 to <2 x i32>
%bitcast14 = bitcast i64 %elem14 to <2 x i32>
%bitcast15 = bitcast i64 %elem15 to <2 x i32>
%low0 = shufflevector <2 x i32> %bitcast0, <2 x i32> %bitcast1,
<2 x i32> <i32 1, i32 3>
%low1 = shufflevector <2 x i32> %bitcast2, <2 x i32> %bitcast3,
<2 x i32> <i32 1, i32 3>
%low2 = shufflevector <2 x i32> %bitcast4, <2 x i32> %bitcast5,
<2 x i32> <i32 1, i32 3>
%low3 = shufflevector <2 x i32> %bitcast6, <2 x i32> %bitcast7,
<2 x i32> <i32 1, i32 3>
%low4 = shufflevector <2 x i32> %bitcast8, <2 x i32> %bitcast9,
<2 x i32> <i32 1, i32 3>
%low5 = shufflevector <2 x i32> %bitcast10, <2 x i32> %bitcast11,
<2 x i32> <i32 1, i32 3>
%low6 = shufflevector <2 x i32> %bitcast12, <2 x i32> %bitcast13,
<2 x i32> <i32 1, i32 3>
%low7 = shufflevector <2 x i32> %bitcast14, <2 x i32> %bitcast15,
<2 x i32> <i32 1, i32 3>
%half0 = bitcast <2 x i32> %low0 to <4 x i16>
%half1 = bitcast <2 x i32> %low1 to <4 x i16>
%half2 = bitcast <2 x i32> %low2 to <4 x i16>
%half3 = bitcast <2 x i32> %low3 to <4 x i16>
%half4 = bitcast <2 x i32> %low4 to <4 x i16>
%half5 = bitcast <2 x i32> %low5 to <4 x i16>
%half6 = bitcast <2 x i32> %low6 to <4 x i16>
%half7 = bitcast <2 x i32> %low7 to <4 x i16>
%hlow0 = shufflevector <4 x i16> %half0, <4 x i16> %half1,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%hlow1 = shufflevector <4 x i16> %half2, <4 x i16> %half3,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%hlow2 = shufflevector <4 x i16> %half4, <4 x i16> %half5,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%hlow3 = shufflevector <4 x i16> %half6, <4 x i16> %half7,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%bytes0 = bitcast <4 x i16> %hlow0 to <8 x i8>
%bytes1 = bitcast <4 x i16> %hlow1 to <8 x i8>
%bytes2 = bitcast <4 x i16> %hlow2 to <8 x i8>
%bytes3 = bitcast <4 x i16> %hlow3 to <8 x i8>
%join0 = shufflevector <8 x i8> %bytes0, <8 x i8> %bytes1,
<8 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15>
%join1 = shufflevector <8 x i8> %bytes2, <8 x i8> %bytes3,
<8 x i32> <i32 1, i32 3, i32 5, i32 7,
i32 9, i32 11, i32 13, i32 15>
%ret = shufflevector <8 x i8> %join0, <8 x i8> %join1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11,
i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %ret
}
; Test a <2 x i64> -> <4 x f32> pack in which only individual elements are
; needed.
define float @f8(i64 %scalar0, i64 %scalar1, i64 %scalar2, i64 %scalar3) {
; CHECK-LABEL: f8:
; CHECK-NOT: vperm
; CHECK-NOT: vpk
; CHECK-NOT: vmrh
; CHECK: aebr {{%f[0-7]}},
; CHECK: aebr {{%f[0-7]}},
; CHECK: meebr %f0,
; CHECK: br %r14
%vec0 = insertelement <2 x i64> undef, i64 %scalar0, i32 0
%vec1 = insertelement <2 x i64> undef, i64 %scalar1, i32 0
%vec2 = insertelement <2 x i64> undef, i64 %scalar2, i32 0
%vec3 = insertelement <2 x i64> undef, i64 %scalar3, i32 0
%join0 = shufflevector <2 x i64> %vec0, <2 x i64> %vec1,
<2 x i32> <i32 0, i32 2>
%join1 = shufflevector <2 x i64> %vec2, <2 x i64> %vec3,
<2 x i32> <i32 0, i32 2>
%bitcast0 = bitcast <2 x i64> %join0 to <4 x float>
%bitcast1 = bitcast <2 x i64> %join1 to <4 x float>
%pack = shufflevector <4 x float> %bitcast0, <4 x float> %bitcast1,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%elt0 = extractelement <4 x float> %pack, i32 0
%elt1 = extractelement <4 x float> %pack, i32 1
%elt2 = extractelement <4 x float> %pack, i32 2
%elt3 = extractelement <4 x float> %pack, i32 3
%add0 = fadd float %elt0, %elt2
%add1 = fadd float %elt1, %elt3
%ret = fmul float %add0, %add1
ret float %ret
}
; Test a <2 x f64> -> <4 x i32> pack in which only individual elements are
; needed.
define i32 @f9(double %scalar0, double %scalar1, double %scalar2,
double %scalar3) {
; CHECK-LABEL: f9:
; CHECK-NOT: vperm
; CHECK-NOT: vpk
; CHECK-NOT: vmrh
; CHECK: ar {{%r[0-5]}},
; CHECK: ar {{%r[0-5]}},
; CHECK: or %r2,
; CHECK: br %r14
%vec0 = insertelement <2 x double> undef, double %scalar0, i32 0
%vec1 = insertelement <2 x double> undef, double %scalar1, i32 0
%vec2 = insertelement <2 x double> undef, double %scalar2, i32 0
%vec3 = insertelement <2 x double> undef, double %scalar3, i32 0
%join0 = shufflevector <2 x double> %vec0, <2 x double> %vec1,
<2 x i32> <i32 0, i32 2>
%join1 = shufflevector <2 x double> %vec2, <2 x double> %vec3,
<2 x i32> <i32 0, i32 2>
%bitcast0 = bitcast <2 x double> %join0 to <4 x i32>
%bitcast1 = bitcast <2 x double> %join1 to <4 x i32>
%pack = shufflevector <4 x i32> %bitcast0, <4 x i32> %bitcast1,
<4 x i32> <i32 1, i32 3, i32 5, i32 7>
%elt0 = extractelement <4 x i32> %pack, i32 0
%elt1 = extractelement <4 x i32> %pack, i32 1
%elt2 = extractelement <4 x i32> %pack, i32 2
%elt3 = extractelement <4 x i32> %pack, i32 3
%add0 = add i32 %elt0, %elt2
%add1 = add i32 %elt1, %elt3
%ret = or i32 %add0, %add1
ret i32 %ret
}


@ -53,3 +53,51 @@ define <16 x i8> @f5() {
i8 0, i8 -1, i8 -1, i8 -1,
i8 0, i8 -1, i8 0, i8 -1>
}
; Test an all-zeros v2i8 that gets promoted to v16i8.
define <2 x i8> @f6() {
; CHECK-LABEL: f6:
; CHECK: vgbm %v24, 0
; CHECK: br %r14
ret <2 x i8> zeroinitializer
}
; Test a mixed v2i8 that gets promoted to v16i8 (mask 0x8000).
define <2 x i8> @f7() {
; CHECK-LABEL: f7:
; CHECK: vgbm %v24, 32768
; CHECK: br %r14
ret <2 x i8> <i8 255, i8 0>
}
; Test an all-zeros v4i8 that gets promoted to v16i8.
define <4 x i8> @f8() {
; CHECK-LABEL: f8:
; CHECK: vgbm %v24, 0
; CHECK: br %r14
ret <4 x i8> zeroinitializer
}
; Test a mixed v4i8 that gets promoted to v16i8 (mask 0x9000).
define <4 x i8> @f9() {
; CHECK-LABEL: f9:
; CHECK: vgbm %v24, 36864
; CHECK: br %r14
ret <4 x i8> <i8 255, i8 0, i8 0, i8 255>
}
; Test an all-zeros v8i8 that gets promoted to v16i8.
define <8 x i8> @f10() {
; CHECK-LABEL: f10:
; CHECK: vgbm %v24, 0
; CHECK: br %r14
ret <8 x i8> zeroinitializer
}
; Test a mixed v8i8 that gets promoted to v16i8 (mask 0xE500).
define <8 x i8> @f11() {
; CHECK-LABEL: f11:
; CHECK: vgbm %v24, 58624
; CHECK: br %r14
ret <8 x i8> <i8 255, i8 255, i8 255, i8 0, i8 0, i8 255, i8 0, i8 255>
}


@ -45,3 +45,35 @@ define <8 x i16> @f5() {
ret <8 x i16> <i16 65280, i16 0, i16 65535, i16 0,
i16 255, i16 65535, i16 256, i16 65280>
}
; Test an all-zeros v2i16 that gets promoted to v8i16.
define <2 x i16> @f6() {
; CHECK-LABEL: f6:
; CHECK: vgbm %v24, 0
; CHECK: br %r14
ret <2 x i16> zeroinitializer
}
; Test a mixed v2i16 that gets promoted to v8i16 (mask 0xc000).
define <2 x i16> @f7() {
; CHECK-LABEL: f7:
; CHECK: vgbm %v24, 49152
; CHECK: br %r14
ret <2 x i16> <i16 65535, i16 0>
}
; Test an all-zeros v4i16 that gets promoted to v8i16.
define <4 x i16> @f8() {
; CHECK-LABEL: f8:
; CHECK: vgbm %v24, 0
; CHECK: br %r14
ret <4 x i16> zeroinitializer
}
; Test a mixed v4i16 that gets promoted to v8i16 (mask 0x7200).
define <4 x i16> @f9() {
; CHECK-LABEL: f9:
; CHECK: vgbm %v24, 29184
; CHECK: br %r14
ret <4 x i16> <i16 255, i16 65535, i16 0, i16 65280>
}


@ -41,3 +41,19 @@ define <4 x i32> @f5() {
; CHECK: br %r14
ret <4 x i32> <i32 4278190080, i32 1, i32 16777215, i32 16776960>
}
; Test an all-zeros v2i32 that gets promoted to v4i32.
define <2 x i32> @f6() {
; CHECK-LABEL: f6:
; CHECK: vgbm %v24, 0
; CHECK: br %r14
ret <2 x i32> zeroinitializer
}
; Test a mixed v2i32 that gets promoted to v4i32 (mask 0xae00).
define <2 x i32> @f7() {
; CHECK-LABEL: f7:
; CHECK: vgbm %v24, 44544
; CHECK: br %r14
ret <2 x i32> <i32 4278255360, i32 -256>
}


@ -45,3 +45,19 @@ define <4 x float> @f5() {
ret <4 x float> <float 0xffffe00000000000, float 0x381fffffc0000000,
float 0x379fffe000000000, float 0x371fe00000000000>
}
; Test an all-zeros v2f32 that gets promoted to v4f32.
define <2 x float> @f6() {
; CHECK-LABEL: f6:
; CHECK: vgbm %v24, 0
; CHECK: br %r14
ret <2 x float> zeroinitializer
}
; Test a mixed v2f32 that gets promoted to v4f32 (mask 0xc700).
define <2 x float> @f7() {
; CHECK-LABEL: f7:
; CHECK: vgbm %v24, 50944
; CHECK: br %r14
ret <2 x float> <float 0xffffe00000000000, float 0x381fffffe0000000>
}


@ -49,3 +49,59 @@ define <2 x double> @f6(<2 x double> %val1, <2 x double> %val2) {
; CHECK: br %r14
ret <2 x double> %val2
}
; Test v2i8 moves.
define <2 x i8> @f7(<2 x i8> %val1, <2 x i8> %val2) {
; CHECK-LABEL: f7:
; CHECK: vlr %v24, %v26
; CHECK: br %r14
ret <2 x i8> %val2
}
; Test v4i8 moves.
define <4 x i8> @f8(<4 x i8> %val1, <4 x i8> %val2) {
; CHECK-LABEL: f8:
; CHECK: vlr %v24, %v26
; CHECK: br %r14
ret <4 x i8> %val2
}
; Test v8i8 moves.
define <8 x i8> @f9(<8 x i8> %val1, <8 x i8> %val2) {
; CHECK-LABEL: f9:
; CHECK: vlr %v24, %v26
; CHECK: br %r14
ret <8 x i8> %val2
}
; Test v2i16 moves.
define <2 x i16> @f10(<2 x i16> %val1, <2 x i16> %val2) {
; CHECK-LABEL: f10:
; CHECK: vlr %v24, %v26
; CHECK: br %r14
ret <2 x i16> %val2
}
; Test v4i16 moves.
define <4 x i16> @f11(<4 x i16> %val1, <4 x i16> %val2) {
; CHECK-LABEL: f11:
; CHECK: vlr %v24, %v26
; CHECK: br %r14
ret <4 x i16> %val2
}
; Test v2i32 moves.
define <2 x i32> @f12(<2 x i32> %val1, <2 x i32> %val2) {
; CHECK-LABEL: f12:
; CHECK: vlr %v24, %v26
; CHECK: br %r14
ret <2 x i32> %val2
}
; Test v2f32 moves.
define <2 x float> @f13(<2 x float> %val1, <2 x float> %val2) {
; CHECK-LABEL: f13:
; CHECK: vlr %v24, %v26
; CHECK: br %r14
ret <2 x float> %val2
}


@ -49,8 +49,8 @@ define <2 x i64> @f4(i64 %val) {
; Test v4f32 insertion into 0.
define <4 x float> @f5(float %val) {
; CHECK-LABEL: f5:
; CHECK: vgbm [[ZERO:%v[0-9]+]], 0
; CHECK: vmrhf [[REG:%v[0-9]+]], [[ZERO]], %v0
; CHECK-DAG: vuplhf [[REG:%v[0-9]+]], %v0
; CHECK-DAG: vgbm [[ZERO:%v[0-9]+]], 0
; CHECK: vmrhg %v24, [[ZERO]], [[REG]]
; CHECK: br %r14
%ret = insertelement <4 x float> zeroinitializer, float %val, i32 3


@ -0,0 +1,106 @@
; Test vector sign-extending loads.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test a v16i1->v16i8 extension.
define <16 x i8> @f1(<16 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <16 x i1>, <16 x i1> *%ptr
%ret = sext <16 x i1> %val to <16 x i8>
ret <16 x i8> %ret
}
; Test a v8i1->v8i16 extension.
define <8 x i16> @f2(<8 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <8 x i1>, <8 x i1> *%ptr
%ret = sext <8 x i1> %val to <8 x i16>
ret <8 x i16> %ret
}
; Test a v8i8->v8i16 extension.
define <8 x i16> @f3(<8 x i8> *%ptr) {
; CHECK-LABEL: f3:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuphb %v24, [[REG1]]
; CHECK: br %r14
%val = load <8 x i8>, <8 x i8> *%ptr
%ret = sext <8 x i8> %val to <8 x i16>
ret <8 x i16> %ret
}
; Test a v4i1->v4i32 extension.
define <4 x i32> @f4(<4 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <4 x i1>, <4 x i1> *%ptr
%ret = sext <4 x i1> %val to <4 x i32>
ret <4 x i32> %ret
}
; Test a v4i8->v4i32 extension.
define <4 x i32> @f5(<4 x i8> *%ptr) {
; CHECK-LABEL: f5:
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuphh %v24, [[REG2]]
; CHECK: br %r14
%val = load <4 x i8>, <4 x i8> *%ptr
%ret = sext <4 x i8> %val to <4 x i32>
ret <4 x i32> %ret
}
; Test a v4i16->v4i32 extension.
define <4 x i32> @f6(<4 x i16> *%ptr) {
; CHECK-LABEL: f6:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuphh %v24, [[REG1]]
; CHECK: br %r14
%val = load <4 x i16>, <4 x i16> *%ptr
%ret = sext <4 x i16> %val to <4 x i32>
ret <4 x i32> %ret
}
; Test a v2i1->v2i64 extension.
define <2 x i64> @f7(<2 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <2 x i1>, <2 x i1> *%ptr
%ret = sext <2 x i1> %val to <2 x i64>
ret <2 x i64> %ret
}
; Test a v2i8->v2i64 extension.
define <2 x i64> @f8(<2 x i8> *%ptr) {
; CHECK-LABEL: f8:
; CHECK: vlrepb [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vleb [[REG1]], 1(%r2), 1
; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuphh [[REG3:%v[0-9]+]], [[REG2]]
; CHECK: vuphf %v24, [[REG3]]
; CHECK: br %r14
%val = load <2 x i8>, <2 x i8> *%ptr
%ret = sext <2 x i8> %val to <2 x i64>
ret <2 x i64> %ret
}
; Test a v2i16->v2i64 extension.
define <2 x i64> @f9(<2 x i16> *%ptr) {
; CHECK-LABEL: f9:
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuphh [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuphf %v24, [[REG2]]
; CHECK: br %r14
%val = load <2 x i16>, <2 x i16> *%ptr
%ret = sext <2 x i16> %val to <2 x i64>
ret <2 x i64> %ret
}
; Test a v2i32->v2i64 extension.
define <2 x i64> @f10(<2 x i32> *%ptr) {
; CHECK-LABEL: f10:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuphf %v24, [[REG1]]
; CHECK: br %r14
%val = load <2 x i32>, <2 x i32> *%ptr
%ret = sext <2 x i32> %val to <2 x i64>
ret <2 x i64> %ret
}


@ -0,0 +1,106 @@
; Test vector zero-extending loads.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Test a v16i1->v16i8 extension.
define <16 x i8> @f1(<16 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <16 x i1>, <16 x i1> *%ptr
%ret = zext <16 x i1> %val to <16 x i8>
ret <16 x i8> %ret
}
; Test a v8i1->v8i16 extension.
define <8 x i16> @f2(<8 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <8 x i1>, <8 x i1> *%ptr
%ret = zext <8 x i1> %val to <8 x i16>
ret <8 x i16> %ret
}
; Test a v8i8->v8i16 extension.
define <8 x i16> @f3(<8 x i8> *%ptr) {
; CHECK-LABEL: f3:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhb %v24, [[REG1]]
; CHECK: br %r14
%val = load <8 x i8>, <8 x i8> *%ptr
%ret = zext <8 x i8> %val to <8 x i16>
ret <8 x i16> %ret
}
; Test a v4i1->v4i32 extension.
define <4 x i32> @f4(<4 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <4 x i1>, <4 x i1> *%ptr
%ret = zext <4 x i1> %val to <4 x i32>
ret <4 x i32> %ret
}
; Test a v4i8->v4i32 extension.
define <4 x i32> @f5(<4 x i8> *%ptr) {
; CHECK-LABEL: f5:
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuplhh %v24, [[REG2]]
; CHECK: br %r14
%val = load <4 x i8>, <4 x i8> *%ptr
%ret = zext <4 x i8> %val to <4 x i32>
ret <4 x i32> %ret
}
; Test a v4i16->v4i32 extension.
define <4 x i32> @f6(<4 x i16> *%ptr) {
; CHECK-LABEL: f6:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhh %v24, [[REG1]]
; CHECK: br %r14
%val = load <4 x i16>, <4 x i16> *%ptr
%ret = zext <4 x i16> %val to <4 x i32>
ret <4 x i32> %ret
}
; Test a v2i1->v2i64 extension.
define <2 x i64> @f7(<2 x i1> *%ptr) {
; No expected output, but must compile.
%val = load <2 x i1>, <2 x i1> *%ptr
%ret = zext <2 x i1> %val to <2 x i64>
ret <2 x i64> %ret
}
; Test a v2i8->v2i64 extension.
define <2 x i64> @f8(<2 x i8> *%ptr) {
; CHECK-LABEL: f8:
; CHECK: vlrepb [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vleb [[REG1]], 1(%r2), 1
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
; CHECK: vuplhf %v24, [[REG3]]
; CHECK: br %r14
%val = load <2 x i8>, <2 x i8> *%ptr
%ret = zext <2 x i8> %val to <2 x i64>
ret <2 x i64> %ret
}
; Test a v2i16->v2i64 extension.
define <2 x i64> @f9(<2 x i16> *%ptr) {
; CHECK-LABEL: f9:
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuplhf %v24, [[REG2]]
; CHECK: br %r14
%val = load <2 x i16>, <2 x i16> *%ptr
%ret = zext <2 x i16> %val to <2 x i64>
ret <2 x i64> %ret
}
; Test a v2i32->v2i64 extension.
define <2 x i64> @f10(<2 x i32> *%ptr) {
; CHECK-LABEL: f10:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhf %v24, [[REG1]]
; CHECK: br %r14
%val = load <2 x i32>, <2 x i32> *%ptr
%ret = zext <2 x i32> %val to <2 x i64>
ret <2 x i64> %ret
}


@ -85,3 +85,64 @@ define double @f7(<2 x double> %val1, <2 x double> %val2) {
%ret = fsub double %scalar1, %scalar2
ret double %ret
}
; Test a v2i8 subtraction, which gets promoted to v16i8.
define <2 x i8> @f8(<2 x i8> %dummy, <2 x i8> %val1, <2 x i8> %val2) {
; CHECK-LABEL: f8:
; CHECK: vsb %v24, %v26, %v28
; CHECK: br %r14
%ret = sub <2 x i8> %val1, %val2
ret <2 x i8> %ret
}
; Test a v4i8 subtraction, which gets promoted to v16i8.
define <4 x i8> @f9(<4 x i8> %dummy, <4 x i8> %val1, <4 x i8> %val2) {
; CHECK-LABEL: f9:
; CHECK: vsb %v24, %v26, %v28
; CHECK: br %r14
%ret = sub <4 x i8> %val1, %val2
ret <4 x i8> %ret
}
; Test a v8i8 subtraction, which gets promoted to v16i8.
define <8 x i8> @f10(<8 x i8> %dummy, <8 x i8> %val1, <8 x i8> %val2) {
; CHECK-LABEL: f10:
; CHECK: vsb %v24, %v26, %v28
; CHECK: br %r14
%ret = sub <8 x i8> %val1, %val2
ret <8 x i8> %ret
}
; Test a v2i16 subtraction, which gets promoted to v8i16.
define <2 x i16> @f11(<2 x i16> %dummy, <2 x i16> %val1, <2 x i16> %val2) {
; CHECK-LABEL: f11:
; CHECK: vsh %v24, %v26, %v28
; CHECK: br %r14
%ret = sub <2 x i16> %val1, %val2
ret <2 x i16> %ret
}
; Test a v4i16 subtraction, which gets promoted to v8i16.
define <4 x i16> @f12(<4 x i16> %dummy, <4 x i16> %val1, <4 x i16> %val2) {
; CHECK-LABEL: f12:
; CHECK: vsh %v24, %v26, %v28
; CHECK: br %r14
%ret = sub <4 x i16> %val1, %val2
ret <4 x i16> %ret
}
; Test a v2i32 subtraction, which gets promoted to v4i32.
define <2 x i32> @f13(<2 x i32> %dummy, <2 x i32> %val1, <2 x i32> %val2) {
; CHECK-LABEL: f13:
; CHECK: vsf %v24, %v26, %v28
; CHECK: br %r14
%ret = sub <2 x i32> %val1, %val2
ret <2 x i32> %ret
}
; Test a v2f32 subtraction, which gets promoted to v4f32.
define <2 x float> @f14(<2 x float> %val1, <2 x float> %val2) {
; No particular output expected, but must compile.
%ret = fsub <2 x float> %val1, %val2
ret <2 x float> %ret
}