1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 11:02:59 +02:00

[VE] VEC_BROADCAST, lowering and isel

This defines the vec_broadcast SDNode along with lowering and isel code.
We also remove unused type mappings for the vector register classes (all vector MVTs that are not used in the ISA are removed).

We will implement support for short vectors later by intercepting nodes with illegal vector EVTs before LLVM has had a chance to widen them.

Reviewed By: kaz7

Differential Revision: https://reviews.llvm.org/D91646
This commit is contained in:
Simon Moll 2020-11-19 09:44:48 +01:00
parent 1e4ea2938d
commit 8a479552ad
7 changed files with 426 additions and 65 deletions

View File

@ -103,14 +103,7 @@ def RetCC_VE_C : CallingConv<[
// handled conforming to the standard cc.
def CC_VE_Fast : CallingConv<[
// vector --> generic vector registers
CCIfType<[v2i32, v2i64, v2f32, v2f64,
v4i32, v4i64, v4f32, v4f64,
v8i32, v8i64, v8f32, v8f64,
v16i32, v16i64, v16f32, v16f64,
v32i32, v32i64, v32f32, v32f64,
v64i32, v64i64, v64f32, v64f64,
v128i32, v128i64, v128f32, v128f64,
v256i32, v256f32, v256i64, v256f64],
CCIfType<[v256i32, v256f32, v256i64, v256f64],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
// TODO: make this conditional on packed mode
CCIfType<[v512i32, v512f32],
@ -131,14 +124,7 @@ def CC_VE_Fast : CallingConv<[
def RetCC_VE_Fast : CallingConv<[
// vector --> generic vector registers
CCIfType<[v2i32, v2i64, v2f32, v2f64,
v4i32, v4i64, v4f32, v4f64,
v8i32, v8i64, v8f32, v8f64,
v16i32, v16i64, v16f32, v16f64,
v32i32, v32i64, v32f32, v32f64,
v64i32, v64i64, v64f32, v64f64,
v128i32, v128i64, v128f32, v128f64,
v256i32, v256f32, v256i64, v256f64],
CCIfType<[v256i32, v256f32, v256i64, v256f64],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
// TODO: make this conditional on packed mode
CCIfType<[v512i32, v512f32],

View File

@ -70,6 +70,11 @@ bool VETargetLowering::CanLowerReturn(
return CCInfo.CheckReturn(Outs, RetCC);
}
// Vector value types natively supported by the VE vector ISA; all other
// vector MVTs are legalized (widened or promoted) before instruction
// selection.
static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
MVT::v256f32, MVT::v512f32, MVT::v256f64};
// Vector mask (predicate) value types.
static const MVT AllMaskVTs[] = {MVT::v256i1, MVT::v512i1};
void VETargetLowering::initRegisterClasses() {
// Set up the register classes.
addRegisterClass(MVT::i32, &VE::I32RegClass);
@ -79,46 +84,10 @@ void VETargetLowering::initRegisterClasses() {
addRegisterClass(MVT::f128, &VE::F128RegClass);
if (Subtarget->enableVPU()) {
addRegisterClass(MVT::v2i32, &VE::V64RegClass);
addRegisterClass(MVT::v4i32, &VE::V64RegClass);
addRegisterClass(MVT::v8i32, &VE::V64RegClass);
addRegisterClass(MVT::v16i32, &VE::V64RegClass);
addRegisterClass(MVT::v32i32, &VE::V64RegClass);
addRegisterClass(MVT::v64i32, &VE::V64RegClass);
addRegisterClass(MVT::v128i32, &VE::V64RegClass);
addRegisterClass(MVT::v256i32, &VE::V64RegClass);
addRegisterClass(MVT::v512i32, &VE::V64RegClass);
addRegisterClass(MVT::v2i64, &VE::V64RegClass);
addRegisterClass(MVT::v4i64, &VE::V64RegClass);
addRegisterClass(MVT::v8i64, &VE::V64RegClass);
addRegisterClass(MVT::v16i64, &VE::V64RegClass);
addRegisterClass(MVT::v32i64, &VE::V64RegClass);
addRegisterClass(MVT::v64i64, &VE::V64RegClass);
addRegisterClass(MVT::v128i64, &VE::V64RegClass);
addRegisterClass(MVT::v256i64, &VE::V64RegClass);
addRegisterClass(MVT::v2f32, &VE::V64RegClass);
addRegisterClass(MVT::v4f32, &VE::V64RegClass);
addRegisterClass(MVT::v8f32, &VE::V64RegClass);
addRegisterClass(MVT::v16f32, &VE::V64RegClass);
addRegisterClass(MVT::v32f32, &VE::V64RegClass);
addRegisterClass(MVT::v64f32, &VE::V64RegClass);
addRegisterClass(MVT::v128f32, &VE::V64RegClass);
addRegisterClass(MVT::v256f32, &VE::V64RegClass);
addRegisterClass(MVT::v512f32, &VE::V64RegClass);
addRegisterClass(MVT::v2f64, &VE::V64RegClass);
addRegisterClass(MVT::v4f64, &VE::V64RegClass);
addRegisterClass(MVT::v8f64, &VE::V64RegClass);
addRegisterClass(MVT::v16f64, &VE::V64RegClass);
addRegisterClass(MVT::v32f64, &VE::V64RegClass);
addRegisterClass(MVT::v64f64, &VE::V64RegClass);
addRegisterClass(MVT::v128f64, &VE::V64RegClass);
addRegisterClass(MVT::v256f64, &VE::V64RegClass);
addRegisterClass(MVT::v256i1, &VE::VMRegClass);
addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
for (MVT VecVT : AllVectorVTs)
addRegisterClass(VecVT, &VE::V64RegClass);
for (MVT MaskVT : AllMaskVTs)
addRegisterClass(MaskVT, &VE::VMRegClass);
}
}
@ -285,7 +254,8 @@ void VETargetLowering::initSPUActions() {
}
void VETargetLowering::initVPUActions() {
  // TODO upstream vector isel
  // Request custom lowering of BUILD_VECTOR for every vector type the ISA
  // supports natively, so splat build-vectors become VEC_BROADCAST nodes.
  for (MVT VT : AllVectorVTs) {
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
}
SDValue
@ -898,6 +868,7 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
TARGET_NODE_CASE(GETTLSADDR)
TARGET_NODE_CASE(MEMBARRIER)
TARGET_NODE_CASE(CALL)
TARGET_NODE_CASE(VEC_BROADCAST)
TARGET_NODE_CASE(RET_FLAG)
TARGET_NODE_CASE(GLOBAL_BASE_REG)
}
@ -1403,6 +1374,32 @@ SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
return DAG.getMergeValues(Ops, DL);
}
/// Return the value replicated across all lanes of \p N if it is a splat
/// BUILD_VECTOR, otherwise an empty SDValue.
static SDValue getSplatValue(SDNode *N) {
  auto *BVN = dyn_cast<BuildVectorSDNode>(N);
  return BVN ? BVN->getSplatValue() : SDValue();
}
/// Lower a BUILD_VECTOR node. A splat build-vector is turned into a single
/// VEC_BROADCAST of the splatted scalar with the element count as the AVL
/// operand; any other build-vector is left to the default expansion.
SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  unsigned NumEls = Op.getValueType().getVectorNumElements();
  MVT ElemVT = Op.getSimpleValueType().getVectorElementType();

  if (SDValue ScalarV = getSplatValue(Op.getNode())) {
    // Lower to VEC_BROADCAST. The result is always a full-width (256-lane)
    // vector; only the first NumEls lanes (the AVL) carry defined values.
    MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);

    auto AVL = DAG.getConstant(NumEls, DL, MVT::i32);
    // Broadcast the splat value itself, not operand 0: if lane 0 is undef,
    // getSplatValue() returns the value of the first defined lane, whereas
    // Op.getOperand(0) would broadcast undef.
    return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, ScalarV, AVL);
  }

  // Expand
  return SDValue();
}
SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
@ -1423,6 +1420,8 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerJumpTable(Op, DAG);
case ISD::LOAD:
return lowerLOAD(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::STORE:
return lowerSTORE(Op, DAG);
case ISD::VASTART:

View File

@ -34,6 +34,8 @@ enum NodeType : unsigned {
MEMBARRIER, // Compiler barrier only; generate a no-op.
VEC_BROADCAST, // 0: scalar value, 1: VL
CALL, // A call instruction.
RET_FLAG, // Return with a flag operand.
GLOBAL_BASE_REG, // Global base reg for PIC.
@ -114,6 +116,8 @@ public:
SDValue lowerToTLSGeneralDynamicModel(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
/// } Custom Lower
/// Custom DAGCombine {

View File

@ -2224,3 +2224,6 @@ include "VEInstrVec.td"
// The vevlintrin
include "VEInstrIntrinsicVL.td"
// Patterns and intermediate SD nodes (VEC_*).
include "VEInstrPatternsVec.td"

View File

@ -0,0 +1,48 @@
//===-- VEInstrPatternsVec.td - VEC_-type SDNodes and isel for VE Target --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the VEC_* prefixed intermediate SDNodes and their
// isel patterns.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Instruction format superclass
//===----------------------------------------------------------------------===//
// Custom intermediate ISDs.
// IsVLVT<OpIdx> - constrains operand OpIdx to i32, the type of the
// vector-length (VL) operand.
class IsVLVT<int OpIdx> : SDTCisVT<OpIdx,i32>;
// vec_broadcast - replicates a scalar across the lanes of the result vector.
// Operand 0: scalar value, operand 1: vector length (VL, i32).
def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2, [SDTCisVec<0>, IsVLVT<2>]>>;
// Broadcast patterns for 32-bit element types. A scalar register operand is
// widened to i64 via INSERT_SUBREG (at SubRegIdx) before feeding VBRDrl.
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp, SDNodeXForm ImmCast, int SubRegIdx> {
// VBRDil - broadcast of a 7-bit immediate (ImmCast converts it for encoding).
def : Pat<(v32 (vec_broadcast (s32 ImmOp:$sy), i32:$vl)),
(VBRDil (ImmCast $sy), i32:$vl)>;
// VBRDrl - broadcast of a scalar register.
def : Pat<(v32 (vec_broadcast s32:$sy, i32:$vl)),
(VBRDrl
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $sy, SubRegIdx),
i32:$vl)>;
}
defm : vbrd_elem32<v256f32, f32, simm7fp, LO7FP, sub_f32>;
defm : vbrd_elem32<v256i32, i32, simm7, LO7, sub_i32>;
// Broadcast patterns for 64-bit element types; the scalar already occupies a
// full 64-bit register, so no subregister insertion is needed.
multiclass vbrd_elem64<ValueType v64, ValueType s64, SDPatternOperator ImmOp, SDNodeXForm ImmCast> {
// VBRDil - broadcast of a 7-bit immediate (ImmCast converts it for encoding).
def : Pat<(v64 (vec_broadcast (s64 ImmOp:$sy), i32:$vl)),
(VBRDil (ImmCast $sy), i32:$vl)>;
// VBRDrl - broadcast of a scalar register.
def : Pat<(v64 (vec_broadcast s64:$sy, i32:$vl)),
(VBRDrl s64:$sy, i32:$vl)>;
}
defm : vbrd_elem64<v256f64, f64, simm7fp, LO7FP>;
defm : vbrd_elem64<v256i64, i64, simm7, LO7>;

View File

@ -185,14 +185,7 @@ def F128 : RegisterClass<"VE", [f128], 128,
def V64 : RegisterClass<"VE",
[v256f64, // default type for vector registers
v512i32, v512f32,
v256i64, v256i32, v256f32, /* v256f64, */
v128i64, v128i32, v128f32, v128f64,
v64i64, v64i32, v64f32, v64f64,
v32i64, v32i32, v32f32, v32f64,
v16i64, v16i32, v16f32, v16f64,
v8i64, v8i32, v8f32, v8f64,
v4i64, v4i32, v4f32, v4f64,
v2i64, v2i32, v2f32, v2f64], 64,
v256i64, v256i32, v256f32, /* v256f64, */], 64,
(add (sequence "V%u", 0, 63),
VIX)>;

View File

@ -0,0 +1,328 @@
; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
; ISA-compatible vector broadcasts
; Broadcast of a scalar i64 register.
define fastcc <256 x i64> @brd_v256i64(i64 %s) {
; CHECK-LABEL: brd_v256i64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x i64> undef, i64 %s, i32 0
%ret = shufflevector <256 x i64> %val, <256 x i64> undef, <256 x i32> zeroinitializer
ret <256 x i64> %ret
}
; Broadcast of an i64 immediate (vbrd immediate form).
define fastcc <256 x i64> @brdi_v256i64() {
; CHECK-LABEL: brdi_v256i64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s0, 256
; CHECK-NEXT: lvl %s0
; CHECK-NEXT: vbrd %v0, 1
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x i64> undef, i64 1, i32 0
%ret = shufflevector <256 x i64> %val, <256 x i64> undef, <256 x i32> zeroinitializer
ret <256 x i64> %ret
}
; Broadcast of a scalar double register.
define fastcc <256 x double> @brd_v256f64(double %s) {
; CHECK-LABEL: brd_v256f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x double> undef, double %s, i32 0
%ret = shufflevector <256 x double> %val, <256 x double> undef, <256 x i32> zeroinitializer
ret <256 x double> %ret
}
; Broadcast of a double immediate.
define fastcc <256 x double> @brdi_v256f64() {
; CHECK-LABEL: brdi_v256f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s0, 256
; CHECK-NEXT: lvl %s0
; CHECK-NEXT: vbrd %v0, 0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x double> undef, double 0.e+00, i32 0
%ret = shufflevector <256 x double> %val, <256 x double> undef, <256 x i32> zeroinitializer
ret <256 x double> %ret
}
; Broadcast of a scalar i32 register (note the adds.w.sx normalizing the
; sub-word operand before the broadcast).
define fastcc <256 x i32> @brd_v256i32(i32 %s) {
; CHECK-LABEL: brd_v256i32:
; CHECK: # %bb.0:
; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x i32> undef, i32 %s, i32 0
%ret = shufflevector <256 x i32> %val, <256 x i32> undef, <256 x i32> zeroinitializer
ret <256 x i32> %ret
}
; Broadcast of an i32 immediate.
define fastcc <256 x i32> @brdi_v256i32(i32 %s) {
; CHECK-LABEL: brdi_v256i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s0, 256
; CHECK-NEXT: lvl %s0
; CHECK-NEXT: vbrd %v0, 13
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x i32> undef, i32 13, i32 0
%ret = shufflevector <256 x i32> %val, <256 x i32> undef, <256 x i32> zeroinitializer
ret <256 x i32> %ret
}
; Broadcast of a scalar float register.
define fastcc <256 x float> @brd_v256f32(float %s) {
; CHECK-LABEL: brd_v256f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x float> undef, float %s, i32 0
%ret = shufflevector <256 x float> %val, <256 x float> undef, <256 x i32> zeroinitializer
ret <256 x float> %ret
}
; Broadcast of a float immediate.
define fastcc <256 x float> @brdi_v256f32(float %s) {
; CHECK-LABEL: brdi_v256f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s0, 256
; CHECK-NEXT: lvl %s0
; CHECK-NEXT: vbrd %v0, 0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x float> undef, float 0.e+00, i32 0
%ret = shufflevector <256 x float> %val, <256 x float> undef, <256 x i32> zeroinitializer
ret <256 x float> %ret
}
; Shorter vectors, we expect these to be widened (for now).
; Note the AVL is still 256 in the checks below: the operation is widened to
; the legal 256-element type before lowering.
define fastcc <128 x i64> @brd_v128i64(i64 %s) {
; CHECK-LABEL: brd_v128i64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <128 x i64> undef, i64 %s, i32 0
%ret = shufflevector <128 x i64> %val, <128 x i64> undef, <128 x i32> zeroinitializer
ret <128 x i64> %ret
}
define fastcc <128 x double> @brd_v128f64(double %s) {
; CHECK-LABEL: brd_v128f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <128 x double> undef, double %s, i32 0
%ret = shufflevector <128 x double> %val, <128 x double> undef, <128 x i32> zeroinitializer
ret <128 x double> %ret
}
define fastcc <128 x i32> @brd_v128i32(i32 %s) {
; CHECK-LABEL: brd_v128i32:
; CHECK: # %bb.0:
; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <128 x i32> undef, i32 %s, i32 0
%ret = shufflevector <128 x i32> %val, <128 x i32> undef, <128 x i32> zeroinitializer
ret <128 x i32> %ret
}
define fastcc <128 x i32> @brdi_v128i32(i32 %s) {
; CHECK-LABEL: brdi_v128i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s0, 256
; CHECK-NEXT: lvl %s0
; CHECK-NEXT: vbrd %v0, 13
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <128 x i32> undef, i32 13, i32 0
%ret = shufflevector <128 x i32> %val, <128 x i32> undef, <128 x i32> zeroinitializer
ret <128 x i32> %ret
}
define fastcc <128 x float> @brd_v128f32(float %s) {
; CHECK-LABEL: brd_v128f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <128 x float> undef, float %s, i32 0
%ret = shufflevector <128 x float> %val, <128 x float> undef, <128 x i32> zeroinitializer
ret <128 x float> %ret
}
define fastcc <128 x float> @brdi_v128f32(float %s) {
; CHECK-LABEL: brdi_v128f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s0, 256
; CHECK-NEXT: lvl %s0
; CHECK-NEXT: vbrd %v0, 0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <128 x float> undef, float 0.e+00, i32 0
%ret = shufflevector <128 x float> %val, <128 x float> undef, <128 x i32> zeroinitializer
ret <128 x float> %ret
}
; Vectors with small element types and valid element count, we expect those to be promoted.
; The i16 splat value is promoted to i32 (adds.w.sx) and then broadcast.
define fastcc <256 x i16> @brd_v256i16(i16 %s) {
; CHECK-LABEL: brd_v256i16:
; CHECK: # %bb.0:
; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <256 x i16> undef, i16 %s, i32 0
%ret = shufflevector <256 x i16> %val, <256 x i16> undef, <256 x i32> zeroinitializer
ret <256 x i16> %ret
}
; Vectors with small element types and low element count, these are scalarized for now.
; FIXME Promote + Widen
; The splat is scalarized into 128 individual 2-byte stores (st2b) through
; the sret pointer in %s0; no vbrd is emitted.
define fastcc <128 x i16> @brd_v128i16(i16 %s) {
; CHECK-LABEL: brd_v128i16:
; CHECK: # %bb.0:
; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
; CHECK-NEXT: st2b %s1, 254(, %s0)
; CHECK-NEXT: st2b %s1, 252(, %s0)
; CHECK-NEXT: st2b %s1, 250(, %s0)
; CHECK-NEXT: st2b %s1, 248(, %s0)
; CHECK-NEXT: st2b %s1, 246(, %s0)
; CHECK-NEXT: st2b %s1, 244(, %s0)
; CHECK-NEXT: st2b %s1, 242(, %s0)
; CHECK-NEXT: st2b %s1, 240(, %s0)
; CHECK-NEXT: st2b %s1, 238(, %s0)
; CHECK-NEXT: st2b %s1, 236(, %s0)
; CHECK-NEXT: st2b %s1, 234(, %s0)
; CHECK-NEXT: st2b %s1, 232(, %s0)
; CHECK-NEXT: st2b %s1, 230(, %s0)
; CHECK-NEXT: st2b %s1, 228(, %s0)
; CHECK-NEXT: st2b %s1, 226(, %s0)
; CHECK-NEXT: st2b %s1, 224(, %s0)
; CHECK-NEXT: st2b %s1, 222(, %s0)
; CHECK-NEXT: st2b %s1, 220(, %s0)
; CHECK-NEXT: st2b %s1, 218(, %s0)
; CHECK-NEXT: st2b %s1, 216(, %s0)
; CHECK-NEXT: st2b %s1, 214(, %s0)
; CHECK-NEXT: st2b %s1, 212(, %s0)
; CHECK-NEXT: st2b %s1, 210(, %s0)
; CHECK-NEXT: st2b %s1, 208(, %s0)
; CHECK-NEXT: st2b %s1, 206(, %s0)
; CHECK-NEXT: st2b %s1, 204(, %s0)
; CHECK-NEXT: st2b %s1, 202(, %s0)
; CHECK-NEXT: st2b %s1, 200(, %s0)
; CHECK-NEXT: st2b %s1, 198(, %s0)
; CHECK-NEXT: st2b %s1, 196(, %s0)
; CHECK-NEXT: st2b %s1, 194(, %s0)
; CHECK-NEXT: st2b %s1, 192(, %s0)
; CHECK-NEXT: st2b %s1, 190(, %s0)
; CHECK-NEXT: st2b %s1, 188(, %s0)
; CHECK-NEXT: st2b %s1, 186(, %s0)
; CHECK-NEXT: st2b %s1, 184(, %s0)
; CHECK-NEXT: st2b %s1, 182(, %s0)
; CHECK-NEXT: st2b %s1, 180(, %s0)
; CHECK-NEXT: st2b %s1, 178(, %s0)
; CHECK-NEXT: st2b %s1, 176(, %s0)
; CHECK-NEXT: st2b %s1, 174(, %s0)
; CHECK-NEXT: st2b %s1, 172(, %s0)
; CHECK-NEXT: st2b %s1, 170(, %s0)
; CHECK-NEXT: st2b %s1, 168(, %s0)
; CHECK-NEXT: st2b %s1, 166(, %s0)
; CHECK-NEXT: st2b %s1, 164(, %s0)
; CHECK-NEXT: st2b %s1, 162(, %s0)
; CHECK-NEXT: st2b %s1, 160(, %s0)
; CHECK-NEXT: st2b %s1, 158(, %s0)
; CHECK-NEXT: st2b %s1, 156(, %s0)
; CHECK-NEXT: st2b %s1, 154(, %s0)
; CHECK-NEXT: st2b %s1, 152(, %s0)
; CHECK-NEXT: st2b %s1, 150(, %s0)
; CHECK-NEXT: st2b %s1, 148(, %s0)
; CHECK-NEXT: st2b %s1, 146(, %s0)
; CHECK-NEXT: st2b %s1, 144(, %s0)
; CHECK-NEXT: st2b %s1, 142(, %s0)
; CHECK-NEXT: st2b %s1, 140(, %s0)
; CHECK-NEXT: st2b %s1, 138(, %s0)
; CHECK-NEXT: st2b %s1, 136(, %s0)
; CHECK-NEXT: st2b %s1, 134(, %s0)
; CHECK-NEXT: st2b %s1, 132(, %s0)
; CHECK-NEXT: st2b %s1, 130(, %s0)
; CHECK-NEXT: st2b %s1, 128(, %s0)
; CHECK-NEXT: st2b %s1, 126(, %s0)
; CHECK-NEXT: st2b %s1, 124(, %s0)
; CHECK-NEXT: st2b %s1, 122(, %s0)
; CHECK-NEXT: st2b %s1, 120(, %s0)
; CHECK-NEXT: st2b %s1, 118(, %s0)
; CHECK-NEXT: st2b %s1, 116(, %s0)
; CHECK-NEXT: st2b %s1, 114(, %s0)
; CHECK-NEXT: st2b %s1, 112(, %s0)
; CHECK-NEXT: st2b %s1, 110(, %s0)
; CHECK-NEXT: st2b %s1, 108(, %s0)
; CHECK-NEXT: st2b %s1, 106(, %s0)
; CHECK-NEXT: st2b %s1, 104(, %s0)
; CHECK-NEXT: st2b %s1, 102(, %s0)
; CHECK-NEXT: st2b %s1, 100(, %s0)
; CHECK-NEXT: st2b %s1, 98(, %s0)
; CHECK-NEXT: st2b %s1, 96(, %s0)
; CHECK-NEXT: st2b %s1, 94(, %s0)
; CHECK-NEXT: st2b %s1, 92(, %s0)
; CHECK-NEXT: st2b %s1, 90(, %s0)
; CHECK-NEXT: st2b %s1, 88(, %s0)
; CHECK-NEXT: st2b %s1, 86(, %s0)
; CHECK-NEXT: st2b %s1, 84(, %s0)
; CHECK-NEXT: st2b %s1, 82(, %s0)
; CHECK-NEXT: st2b %s1, 80(, %s0)
; CHECK-NEXT: st2b %s1, 78(, %s0)
; CHECK-NEXT: st2b %s1, 76(, %s0)
; CHECK-NEXT: st2b %s1, 74(, %s0)
; CHECK-NEXT: st2b %s1, 72(, %s0)
; CHECK-NEXT: st2b %s1, 70(, %s0)
; CHECK-NEXT: st2b %s1, 68(, %s0)
; CHECK-NEXT: st2b %s1, 66(, %s0)
; CHECK-NEXT: st2b %s1, 64(, %s0)
; CHECK-NEXT: st2b %s1, 62(, %s0)
; CHECK-NEXT: st2b %s1, 60(, %s0)
; CHECK-NEXT: st2b %s1, 58(, %s0)
; CHECK-NEXT: st2b %s1, 56(, %s0)
; CHECK-NEXT: st2b %s1, 54(, %s0)
; CHECK-NEXT: st2b %s1, 52(, %s0)
; CHECK-NEXT: st2b %s1, 50(, %s0)
; CHECK-NEXT: st2b %s1, 48(, %s0)
; CHECK-NEXT: st2b %s1, 46(, %s0)
; CHECK-NEXT: st2b %s1, 44(, %s0)
; CHECK-NEXT: st2b %s1, 42(, %s0)
; CHECK-NEXT: st2b %s1, 40(, %s0)
; CHECK-NEXT: st2b %s1, 38(, %s0)
; CHECK-NEXT: st2b %s1, 36(, %s0)
; CHECK-NEXT: st2b %s1, 34(, %s0)
; CHECK-NEXT: st2b %s1, 32(, %s0)
; CHECK-NEXT: st2b %s1, 30(, %s0)
; CHECK-NEXT: st2b %s1, 28(, %s0)
; CHECK-NEXT: st2b %s1, 26(, %s0)
; CHECK-NEXT: st2b %s1, 24(, %s0)
; CHECK-NEXT: st2b %s1, 22(, %s0)
; CHECK-NEXT: st2b %s1, 20(, %s0)
; CHECK-NEXT: st2b %s1, 18(, %s0)
; CHECK-NEXT: st2b %s1, 16(, %s0)
; CHECK-NEXT: st2b %s1, 14(, %s0)
; CHECK-NEXT: st2b %s1, 12(, %s0)
; CHECK-NEXT: st2b %s1, 10(, %s0)
; CHECK-NEXT: st2b %s1, 8(, %s0)
; CHECK-NEXT: st2b %s1, 6(, %s0)
; CHECK-NEXT: st2b %s1, 4(, %s0)
; CHECK-NEXT: st2b %s1, 2(, %s0)
; CHECK-NEXT: st2b %s1, (, %s0)
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <128 x i16> undef, i16 %s, i32 0
%ret = shufflevector <128 x i16> %val, <128 x i16> undef, <128 x i32> zeroinitializer
ret <128 x i16> %ret
}