Mirror of https://github.com/RPCS3/llvm-mirror.git, synced 2024-11-22 18:54:02 +01:00
[llvm][aarch64] SVE addressing modes.
Summary:
Added register + immediate and register + register addressing modes for the
following intrinsics:

1. Masked loads and stores:
   * Sign- and zero-extended loads and truncated stores.
   * No extension or truncation.
2. Masked non-temporal loads and stores.

Reviewers: andwar, efriedma

Subscribers: cameron.mcinally, sdesmalen, tschuett, kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D74254
parent 11344326c1
commit ab5eafb42f
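
In short, a masked contiguous SVE access can now be selected with a vector-length-scaled immediate offset or with a scaled register offset, falling back to the plain base-register form otherwise. The sketch below is illustrative only; the names are hypothetical stand-ins, not the patch's API, and it merely summarizes the priority the patterns below establish (reg+imm has the highest AddedComplexity, then reg+reg, then the default form).

// Illustrative sketch only (hypothetical names): preference order for the
// address of a masked contiguous load/store after this patch.
enum class SVEAddrMode { RegImm, RegReg, BaseOnly };

SVEAddrMode pickAddrMode(bool FitsVLImm,        // offset == k * VL with -8 <= k <= 7
                         bool HasScaledIndex) { // address is (add base, (shl idx, scale))
  if (FitsVLImm)
    return SVEAddrMode::RegImm;   // e.g. ld1d { z0.d }, p0/z, [x0, #2, mul vl]
  if (HasScaledIndex)
    return SVEAddrMode::RegReg;   // e.g. ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
  return SVEAddrMode::BaseOnly;   // e.g. ld1d { z0.d }, p0/z, [x0]
}
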
@@ -221,6 +221,15 @@ public:
  void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);

  bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
  /// SVE Reg+Imm addressing mode.
  template <int64_t Min, int64_t Max>
  bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
                                SDValue &OffImm);
  /// SVE Reg+Reg address mode.
  template <unsigned Scale>
  bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
    return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
  }

  void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
  void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -282,6 +291,8 @@ private:
  bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);

  bool SelectSVEArithImm(SDValue N, SDValue &Imm);
  bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                               SDValue &Offset);
};
} // end anonymous namespace

@@ -4427,3 +4438,72 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
                                         CodeGenOpt::Level OptLevel) {
  return new AArch64DAGToDAGISel(TM, OptLevel);
}

/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max,
/// where Root is the memory access using N for its address.
template <int64_t Min, int64_t Max>
bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
                                                   SDValue &Base,
                                                   SDValue &OffImm) {
  assert(isa<MemSDNode>(Root) && "Invalid node.");

  EVT MemVT = cast<MemSDNode>(Root)->getMemoryVT();

  if (N.getOpcode() != ISD::ADD)
    return false;

  SDValue VScale = N.getOperand(1);
  if (VScale.getOpcode() != ISD::VSCALE)
    return false;

  TypeSize TS = MemVT.getSizeInBits();
  int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
  int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();

  if ((MulImm % MemWidthBytes) != 0)
    return false;

  int64_t Offset = MulImm / MemWidthBytes;
  if (Offset < Min || Offset > Max)
    return false;

  Base = N.getOperand(0);
  OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
  return true;
}
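
To make the arithmetic concrete, here is a minimal standalone sketch of the same legality check outside of LLVM's SDValue machinery (the function name and plain-integer interface are hypothetical). For an nxv2i64 access the known-minimum memory width is 16 bytes, so an addend of vscale * 32 bytes becomes immediate #2, while vscale * -144 bytes (offset -9, as in the imm_out_of_range test below) is rejected.

// Standalone sketch (hypothetical helper, not LLVM code): the check performed
// once the address has been matched as (add Base, vscale * MulImm).
#include <cstdint>
#include <optional>

std::optional<int64_t> vlScaledImm(int64_t MulImm, int64_t MemWidthBytes,
                                   int64_t Min = -8, int64_t Max = 7) {
  if (MulImm % MemWidthBytes != 0)
    return std::nullopt;             // not a whole number of vector registers
  int64_t Offset = MulImm / MemWidthBytes;
  if (Offset < Min || Offset > Max)
    return std::nullopt;             // outside the signed 4-bit immediate range
  return Offset;                     // usable as [Xn, #Offset, mul vl]
}
// e.g. vlScaledImm(32, 16) == 2; vlScaledImm(-144, 16) is nullopt (offset -9).
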

/// Select register plus register addressing mode for SVE, with scaled
/// offset.
bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
                                                  SDValue &Base,
                                                  SDValue &Offset) {
  if (N.getOpcode() != ISD::ADD)
    return false;

  // Process an ADD node.
  const SDValue LHS = N.getOperand(0);
  const SDValue RHS = N.getOperand(1);

  // 8 bit data does not come with the SHL node, so it is treated
  // separately.
  if (Scale == 0) {
    Base = LHS;
    Offset = RHS;
    return true;
  }

  // Check if the RHS is a shift node with a constant.
  if (RHS.getOpcode() != ISD::SHL)
    return false;

  const SDValue ShiftRHS = RHS.getOperand(1);
  if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
    if (C->getZExtValue() == Scale) {
      Base = LHS;
      Offset = RHS.getOperand(0);
      return true;
    }

  return false;
}
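
The same logic in a standalone form (hypothetical name, plain integers standing in for SDValues): for element sizes wider than a byte the index register must already be pre-shifted by log2 of the element size, which is what lets the access print as [Xn, Xm, lsl #Scale].

// Standalone sketch (hypothetical helper): does (add base, rhs) match the
// scaled reg+reg form?  rhsShiftAmt is the constant of an ISD::SHL on the
// addend, or -1 if the addend is not a shift by a constant.
#include <cstdint>

bool matchesRegRegForm(unsigned Scale, int64_t rhsShiftAmt) {
  if (Scale == 0)
    return true;                        // 8-bit data: the bare addend is the index
  return rhsShiftAmt == int64_t(Scale); // otherwise the index must be pre-shifted
}
// e.g. an ld1d (Scale == 3) only matches when the DAG shows (shl idx, 3);
// it is then emitted as [x0, x1, lsl #3].
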
@@ -1270,81 +1270,100 @@ let Predicates = [HasSVE] in {

  // Add more complex addressing modes here as required
  multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
                       Instruction RegImmInst> {
                       Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
    // reg + reg
    let AddedComplexity = 1 in {
      def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
                           (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
    }
    // reg + imm
    let AddedComplexity = 2 in {
      def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
                           (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
    }
    def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
                         (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
  }

  // 2-element contiguous loads
  defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D_IMM>;
  defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D_IMM>;
  defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D_IMM>;
  defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D_IMM>;
  defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D_IMM>;
  defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D_IMM>;
  defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>;
  defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D_IMM>;
  defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D_IMM>;
  defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D_IMM>;
  defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
  defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
  defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
  defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
  defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
  defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
  defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
  defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
  defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
  defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;

  // 4-element contiguous loads
  defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S_IMM>;
  defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S_IMM>;
  defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S_IMM>;
  defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S_IMM>;
  defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W_IMM>;
  defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S_IMM>;
  defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W_IMM>;
  defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
  defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
  defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
  defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
  defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
  defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
  defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;

  // 8-element contiguous loads
  defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H_IMM>;
  defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H_IMM>;
  defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H_IMM>;
  defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H_IMM>;
  defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
  defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
  defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
  defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;

  // 16-element contiguous loads
  defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>;
  defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;

  multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
                        Instruction RegImmInst> {
                        Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
    // reg + reg
    let AddedComplexity = 1 in {
      def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
                         (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
    }
    // reg + imm
    let AddedComplexity = 2 in {
      def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
                         (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
    }
    def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
                       (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
  }

  // 2-element contiguous stores
  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D_IMM>;
  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>;
  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>;
  defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
  defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D_IMM>;
  defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D_IMM>;
  defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
  defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
  defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
  defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
  defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;

  // 4-element contiguous stores
  defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S_IMM>;
  defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>;
  defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
  defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S_IMM>;
  defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
  defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
  defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
  defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
  defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
  defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;

  // 8-element contiguous stores
  defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H_IMM>;
  defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
  defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
  defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
  defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
  defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;

  // 16-element contiguous stores
  defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>;
  defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;

  defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>;
  defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRI>;
  defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRI>;
  defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRI>;
  defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>;
  defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>;
  defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>;
  defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>;

  defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>;
  defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>;
  defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>;
  defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>;
  defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>;
  defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>;
  defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
  defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;

  multiclass unpred_store<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
    def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
@@ -7059,3 +7059,11 @@ class sve2_crypto_unary_op<bit opc, string asm>

  let Constraints = "$Zdn = $_Zdn";
}

/// Addressing modes
def am_sve_indexed_s4 : ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>;

def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>;
def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>;
def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>;
def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>;
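
The four reg+reg patterns differ only in the required shift amount, which is log2 of the element size in bytes. A hypothetical helper (illustrative only, not part of the patch) makes the mapping used by the defm lines above explicit:

// Hypothetical helper mirroring the lsl0..lsl3 ComplexPatterns above.
unsigned sveRegRegShift(unsigned ElementBytes) {
  switch (ElementBytes) {
  case 1: return 0;   // am_sve_regreg_lsl0 (ld1b/st1b forms)
  case 2: return 1;   // am_sve_regreg_lsl1 (ld1h/st1h forms)
  case 4: return 2;   // am_sve_regreg_lsl2 (ld1w/st1w forms)
  case 8: return 3;   // am_sve_regreg_lsl3 (ld1d/st1d forms)
  default: return 0;  // not used by the SVE contiguous patterns
  }
}
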
@@ -0,0 +1,622 @@
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

; Range checks: for all the instructions tested in this file, the
; immediate must be within the range [-8, 7] (4-bit immediate). Out of
; range values are tested only in one case (following). Valid values
; are tested all through the rest of the file.

define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: imm_out_of_range:
; CHECK-NEXT: rdvl x8, #8
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}]
; CHECK-NEXT: rdvl x8, #-9
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x{{[0-9]+}}]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -9
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       <vscale x 2 x i64>* %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane contiguous load/stores
|
||||
|
||||
define void @test_masked_ldst_sv2i8(<vscale x 2 x i8> * %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i8:
|
||||
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
|
||||
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, #-7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -8
|
||||
%data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i8> undef)
|
||||
%base_store = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8> * %base, i64 -7
|
||||
call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
|
||||
<vscale x 2 x i8>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2i16(<vscale x 2 x i16> * %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i16:
|
||||
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 -8
|
||||
%data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i16> undef)
|
||||
%base_store = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16> * %base, i64 -7
|
||||
call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
|
||||
<vscale x 2 x i16>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @test_masked_ldst_sv2i32(<vscale x 2 x i32> * %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i32:
|
||||
; CHECK-NEXT: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
|
||||
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -8
|
||||
%data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i32> undef)
|
||||
%base_store = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32> * %base, i64 -7
|
||||
call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
|
||||
<vscale x 2 x i32>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2i64(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i64:
|
||||
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
|
||||
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
|
||||
%data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i64> undef)
|
||||
%base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -7
|
||||
call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
|
||||
<vscale x 2 x i64>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2f16(<vscale x 2 x half> * %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2f16:
|
||||
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x half>, <vscale x 2 x half>* %base, i64 -8
|
||||
%data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x half> undef)
|
||||
%base_store = getelementptr <vscale x 2 x half>, <vscale x 2 x half> * %base, i64 -7
|
||||
call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
|
||||
<vscale x 2 x half>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @test_masked_ldst_sv2f32(<vscale x 2 x float> * %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2f32:
|
||||
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
|
||||
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x float>, <vscale x 2 x float>* %base, i64 -8
|
||||
%data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x float> undef)
|
||||
%base_store = getelementptr <vscale x 2 x float>, <vscale x 2 x float> * %base, i64 -7
|
||||
call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
|
||||
<vscale x 2 x float>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2f64(<vscale x 2 x double> * %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2f64:
|
||||
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl]
|
||||
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-5, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base, i64 -6
|
||||
%data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x double> undef)
|
||||
%base_store = getelementptr <vscale x 2 x double>, <vscale x 2 x double> * %base, i64 -5
|
||||
call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
|
||||
<vscale x 2 x double>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 2-lane zero/sign extended contiguous loads.
|
||||
|
||||
define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
|
||||
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #-4, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -4
|
||||
%load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i8> undef)
|
||||
%ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
|
||||
ret <vscale x 2 x i64> %ext
|
||||
}
|
||||
|
||||
define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
|
||||
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, #-3, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -3
|
||||
%load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i8> undef)
|
||||
%ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
|
||||
ret <vscale x 2 x i64> %ext
|
||||
}
|
||||
|
||||
define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
|
||||
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #1, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 1
|
||||
%load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i16> undef)
|
||||
%ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
|
||||
ret <vscale x 2 x i64> %ext
|
||||
}
|
||||
|
||||
define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
|
||||
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 2
|
||||
%load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i16> undef)
|
||||
%ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
|
||||
ret <vscale x 2 x i64> %ext
|
||||
}
|
||||
|
||||
define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
|
||||
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, #-2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -2
|
||||
%load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i32> undef)
|
||||
%ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
|
||||
ret <vscale x 2 x i64> %ext
|
||||
}
|
||||
|
||||
define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
|
||||
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, #-1, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -1
|
||||
%load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i32> undef)
|
||||
%ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
|
||||
ret <vscale x 2 x i64> %ext
|
||||
}
|
||||
|
||||
; 2-lane truncating contiguous stores.
|
||||
|
||||
define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, <vscale x 2 x i8> *%base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
|
||||
; CHECK-NEXT: st1b { z0.d }, p0, [x0, #3, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 3
|
||||
%trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
|
||||
call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
|
||||
<vscale x 2 x i8> *%base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, <vscale x 2 x i16> *%base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
|
||||
; CHECK-NEXT: st1h { z0.d }, p0, [x0, #4, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 4
|
||||
%trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
|
||||
call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
|
||||
<vscale x 2 x i16> *%base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, <vscale x 2 x i32> *%base, <vscale x 2 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
|
||||
; CHECK-NEXT: st1w { z0.d }, p0, [x0, #5, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 5
|
||||
%trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
|
||||
call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
|
||||
<vscale x 2 x i32> *%base_load,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 4-lane contiguous load/stores.
|
||||
|
||||
define void @test_masked_ldst_sv4i8(<vscale x 4 x i8> * %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv4i8:
|
||||
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
|
||||
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -1
|
||||
%data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x i8> undef)
|
||||
%base_store = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8> * %base, i64 2
|
||||
call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
|
||||
<vscale x 4 x i8>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv4i16(<vscale x 4 x i16> * %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv4i16:
|
||||
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 -1
|
||||
%data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x i16> undef)
|
||||
%base_store = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16> * %base, i64 2
|
||||
call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
|
||||
<vscale x 4 x i16>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv4i32(<vscale x 4 x i32> * %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv4i32:
|
||||
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl]
|
||||
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base, i64 6
|
||||
%data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x i32> undef)
|
||||
%base_store = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32> * %base, i64 7
|
||||
call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
|
||||
<vscale x 4 x i32>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv4f16(<vscale x 4 x half> * %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv4f16:
|
||||
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x half>, <vscale x 4 x half>* %base, i64 -1
|
||||
%data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x half> undef)
|
||||
%base_store = getelementptr <vscale x 4 x half>, <vscale x 4 x half> * %base, i64 2
|
||||
call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
|
||||
<vscale x 4 x half>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv4f32(<vscale x 4 x float> * %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv4f32:
|
||||
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
|
||||
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base, i64 -1
|
||||
%data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x float> undef)
|
||||
%base_store = getelementptr <vscale x 4 x float>, <vscale x 4 x float> * %base, i64 2
|
||||
call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
|
||||
<vscale x 4 x float>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 4-lane zero/sign extended contiguous loads.
|
||||
|
||||
define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
|
||||
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #-4, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -4
|
||||
%load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x i8> undef)
|
||||
%ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
|
||||
ret <vscale x 4 x i32> %ext
|
||||
}
|
||||
|
||||
define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
|
||||
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, #-3, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -3
|
||||
%load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x i8> undef)
|
||||
%ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
|
||||
ret <vscale x 4 x i32> %ext
|
||||
}
|
||||
|
||||
define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
|
||||
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, #1, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 1
|
||||
%load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x i16> undef)
|
||||
%ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
|
||||
ret <vscale x 4 x i32> %ext
|
||||
}
|
||||
|
||||
define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
|
||||
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 2
|
||||
%load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask,
|
||||
<vscale x 4 x i16> undef)
|
||||
%ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
|
||||
ret <vscale x 4 x i32> %ext
|
||||
}
|
||||
|
||||
; 4-lane truncating contiguous stores.
|
||||
|
||||
define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, <vscale x 4 x i8> *%base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
|
||||
; CHECK-NEXT: st1b { z0.s }, p0, [x0, #3, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 3
|
||||
%trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
|
||||
call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
|
||||
<vscale x 4 x i8> *%base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, <vscale x 4 x i16> *%base, <vscale x 4 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
|
||||
; CHECK-NEXT: st1h { z0.s }, p0, [x0, #4, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 4
|
||||
%trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
|
||||
call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
|
||||
<vscale x 4 x i16> *%base_load,
|
||||
i32 1,
|
||||
<vscale x 4 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 8-lane contiguous load/stores.
|
||||
|
||||
define void @test_masked_ldst_sv8i8(<vscale x 8 x i8> * %base, <vscale x 8 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv8i8:
|
||||
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
|
||||
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, #7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 6
|
||||
%data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask,
|
||||
<vscale x 8 x i8> undef)
|
||||
%base_store = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8> * %base, i64 7
|
||||
call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
|
||||
<vscale x 8 x i8>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv8i16(<vscale x 8 x i16> * %base, <vscale x 8 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv8i16:
|
||||
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base, i64 6
|
||||
%data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask,
|
||||
<vscale x 8 x i16> undef)
|
||||
%base_store = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16> * %base, i64 7
|
||||
call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
|
||||
<vscale x 8 x i16>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv8f16:
|
||||
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base, i64 -1
|
||||
%data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask,
|
||||
<vscale x 8 x half> undef)
|
||||
%base_store = getelementptr <vscale x 8 x half>, <vscale x 8 x half> * %base, i64 2
|
||||
call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
|
||||
<vscale x 8 x half>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 8-lane zero/sign extended contiguous loads.
|
||||
|
||||
define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
|
||||
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, #-4, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -4
|
||||
%load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask,
|
||||
<vscale x 8 x i8> undef)
|
||||
%ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
|
||||
ret <vscale x 8 x i16> %ext
|
||||
}
|
||||
|
||||
define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
|
||||
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, #-3, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -3
|
||||
%load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask,
|
||||
<vscale x 8 x i8> undef)
|
||||
%ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
|
||||
ret <vscale x 8 x i16> %ext
|
||||
}
|
||||
|
||||
; 8-lane truncating contiguous stores.
|
||||
|
||||
define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, <vscale x 8 x i8> *%base, <vscale x 8 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
|
||||
; CHECK-NEXT: st1b { z0.h }, p0, [x0, #3, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 3
|
||||
%trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
|
||||
call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
|
||||
<vscale x 8 x i8> *%base_load,
|
||||
i32 1,
|
||||
<vscale x 8 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 16-lane contiguous load/stores.
|
||||
|
||||
define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv16i8:
|
||||
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl]
|
||||
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, #7, mul vl]
|
||||
; CHECK-NEXT: ret
|
||||
%base_load = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base, i64 6
|
||||
%data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_load,
|
||||
i32 1,
|
||||
<vscale x 16 x i1> %mask,
|
||||
<vscale x 16 x i8> undef)
|
||||
%base_store = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8> * %base, i64 7
|
||||
call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
|
||||
<vscale x 16 x i8>* %base_store,
|
||||
i32 1,
|
||||
<vscale x 16 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 2-element contiguous loads.
|
||||
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
|
||||
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
|
||||
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
|
||||
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
|
||||
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
|
||||
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
|
||||
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
|
||||
|
||||
; 4-element contiguous loads.
|
||||
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
|
||||
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
|
||||
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
|
||||
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
|
||||
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
|
||||
|
||||
; 8-element contiguous loads.
|
||||
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
|
||||
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
|
||||
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
|
||||
|
||||
; 16-element contiguous loads.
|
||||
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
|
||||
|
||||
; 2-element contiguous stores.
|
||||
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
|
||||
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
|
||||
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
|
||||
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
|
||||
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
|
||||
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
|
||||
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)
|
||||
|
||||
; 4-element contiguous stores.
|
||||
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
|
||||
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
|
||||
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
|
||||
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
|
||||
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)
|
||||
|
||||
; 8-element contiguous stores.
|
||||
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
|
||||
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
|
||||
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
|
||||
|
||||
; 16-element contiguous stores.
|
||||
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)
|
@@ -0,0 +1,610 @@
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

; 2-lane contiguous load/stores

define void @test_masked_ldst_sv2i8(i8 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i8:
|
||||
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
|
||||
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
|
||||
; CHECK-NEXT: ret
|
||||
%base_i8 = getelementptr i8, i8* %base, i64 %offset
|
||||
%base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
|
||||
%data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i8> undef)
|
||||
call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
|
||||
<vscale x 2 x i8>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2i16(i16 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i16:
|
||||
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
|
||||
; CHECK-NEXT: ret
|
||||
%base_i16 = getelementptr i16, i16* %base, i64 %offset
|
||||
%base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
|
||||
%data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i16> undef)
|
||||
call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
|
||||
<vscale x 2 x i16>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2i32(i32 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i32:
|
||||
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
|
||||
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
|
||||
; CHECK-NEXT: ret
|
||||
%base_i32 = getelementptr i32, i32* %base, i64 %offset
|
||||
%base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
|
||||
%data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i32> undef)
|
||||
call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
|
||||
<vscale x 2 x i32>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2i64(i64 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2i64:
|
||||
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
|
||||
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
|
||||
; CHECK-NEXT: ret
|
||||
%base_i64 = getelementptr i64, i64* %base, i64 %offset
|
||||
%base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
|
||||
%data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i64> undef)
|
||||
call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
|
||||
<vscale x 2 x i64>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2f16(half * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2f16:
|
||||
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
|
||||
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
|
||||
; CHECK-NEXT: ret
|
||||
%base_half = getelementptr half, half* %base, i64 %offset
|
||||
%base_addr = bitcast half* %base_half to <vscale x 2 x half>*
|
||||
%data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x half> undef)
|
||||
call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
|
||||
<vscale x 2 x half>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2f32(float * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2f32:
|
||||
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
|
||||
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
|
||||
; CHECK-NEXT: ret
|
||||
%base_float = getelementptr float, float* %base, i64 %offset
|
||||
%base_addr = bitcast float* %base_float to <vscale x 2 x float>*
|
||||
%data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x float> undef)
|
||||
call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
|
||||
<vscale x 2 x float>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_masked_ldst_sv2f64(double * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: test_masked_ldst_sv2f64:
|
||||
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
|
||||
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
|
||||
; CHECK-NEXT: ret
|
||||
%base_double = getelementptr double, double* %base, i64 %offset
|
||||
%base_addr = bitcast double* %base_double to <vscale x 2 x double>*
|
||||
%data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x double> undef)
|
||||
call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
|
||||
<vscale x 2 x double>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; 2-lane zero/sign extended contiguous loads.
|
||||
|
||||
define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
|
||||
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
|
||||
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
|
||||
; CHECK-NEXT: ret
|
||||
%base_i8 = getelementptr i8, i8* %base, i64 %offset
|
||||
%base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
|
||||
%load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
|
||||
i32 1,
|
||||
<vscale x 2 x i1> %mask,
|
||||
<vscale x 2 x i8> undef)
|
||||
%ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
|
||||
ret <vscale x 2 x i64> %ext
|
||||
}
|
||||
|
||||
define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, i8 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, <vscale x 2 x i8> *%base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, i16 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, <vscale x 2 x i16> *%base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, i32 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, <vscale x 2 x i32> *%base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(i8 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(i16 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(i32 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(half * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 4 x half>*
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(float * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_f32 to <vscale x 4 x float>*
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, i8 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, <vscale x 4 x i8> *%base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, i16 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, <vscale x 4 x i16> *%base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(i8 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data, <vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(i16 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i16>* %base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(half * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 8 x half>*
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data, <vscale x 8 x half>* %base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, i8 *%base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, <vscale x 8 x i8> *%base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(i8 * %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_addr, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %base_addr, i32 1, <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)
@ -0,0 +1,171 @@
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

; Range checks: for all the instructions tested in this file, the immediate
; must be within the range [-8, 7] (4-bit immediate). Out-of-range values are
; tested in only one case (the function below); valid values are exercised
; throughout the rest of the file.

define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: imm_out_of_range:
; CHECK-NEXT: rdvl x8, #8
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}]
; CHECK-NEXT: rdvl x8, #-9
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x{{[0-9]+}}]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64>* %base_load)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -9
  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, <vscale x 2 x i64>* %base_store)
  ret void
}
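
; A minimal sketch for contrast (hypothetical function name, reusing the
; ldnt1 declaration at the end of this file): an offset of 7 vector-length
; multiples sits at the upper edge of [-8, 7], so the access is expected to
; fold into the reg+imm addressing form instead of the rdvl/add sequence above.
define <vscale x 2 x i64> @ldnt1d_imm_upper_bound(<vscale x 2 x i64>* %base, <vscale x 2 x i1> %mask) nounwind {
  ; The getelementptr index over the scalable type is scaled by the vector length.
  %addr = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 7
  ; Expected selection: ldnt1d { z0.d }, p0/z, [x0, #7, mul vl]
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64>* %addr)
  ret <vscale x 2 x i64> %data
}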

; 2-lane non-temporal load/stores.

define void @test_masked_ldst_sv2i64(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64>* %base_load)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -7
  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, <vscale x 2 x i64>* %base_store)
  ret void
}

define void @test_masked_ldst_sv2f64(<vscale x 2 x double> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl]
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-5, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base, i64 -6
  %data = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %mask, <vscale x 2 x double>* %base_load)
  %base_store = getelementptr <vscale x 2 x double>, <vscale x 2 x double> * %base, i64 -5
  call void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, <vscale x 2 x double>* %base_store)
  ret void
}

; 4-lane non-temporal load/stores.

define void @test_masked_ldst_sv4i32(<vscale x 4 x i32> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base, i64 6
  %data = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32>* %base_load)
  %base_store = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32> * %base, i64 7
  call void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, <vscale x 4 x i32>* %base_store)
  ret void
}

define void @test_masked_ldst_sv4f32(<vscale x 4 x float> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base, i64 -1
  %data = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %mask, <vscale x 4 x float>* %base_load)
  %base_store = getelementptr <vscale x 4 x float>, <vscale x 4 x float> * %base, i64 2
  call void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, <vscale x 4 x float>* %base_store)
  ret void
}

; 8-lane non-temporal load/stores.

define void @test_masked_ldst_sv8i16(<vscale x 8 x i16> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base, i64 6
  %data = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16>* %base_load)
  %base_store = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16> * %base, i64 7
  call void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, <vscale x 8 x i16>* %base_store)
  ret void
}

define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base, i64 -1
  %data = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %mask, <vscale x 8 x half>* %base_load)
  %base_store = getelementptr <vscale x 8 x half>, <vscale x 8 x half> * %base, i64 2
  call void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half> %data, <vscale x 8 x i1> %mask, <vscale x 8 x half>* %base_store)
  ret void
}

; 16-lane non-temporal load/stores.

define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base, i64 6
  %data = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8>* %base_load)
  %base_store = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8> * %base, i64 7
  call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, <vscale x 16 x i8>* %base_store)
  ret void
}

; 2-element non-temporal loads.
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)

; 4-element non-temporal loads.
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)

; 8-element non-temporal loads.
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)

; 16-element non-temporal loads.
declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)

; 2-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)

; 4-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)

; 8-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)

; 16-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
@ -0,0 +1,145 @@
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

; 2-lane non-temporal load/stores.

define void @test_masked_ldst_sv2i64(i64* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, i64* %base, i64 %offset
  %base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64>* %base_addr)
  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, <vscale x 2 x i64>* %base_addr)
  ret void
}

define void @test_masked_ldst_sv2f64(double* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, double* %base, i64 %offset
  %base_addr = bitcast double* %base_double to <vscale x 2 x double>*
  %data = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %mask, <vscale x 2 x double>* %base_addr)
  call void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, <vscale x 2 x double>* %base_addr)
  ret void
}

; 4-lane non-temporal load/stores.

define void @test_masked_ldst_sv4i32(i32* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
  %data = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32>* %base_addr)
  call void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, <vscale x 4 x i32>* %base_addr)
  ret void
}

define void @test_masked_ldst_sv4f32(float* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_float to <vscale x 4 x float>*
  %data = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %mask, <vscale x 4 x float>* %base_addr)
  call void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, <vscale x 4 x float>* %base_addr)
  ret void
}

; 8-lane non-temporal load/stores.

define void @test_masked_ldst_sv8i16(i16* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
  %data = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16>* %base_addr)
  call void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, <vscale x 8 x i16>* %base_addr)
  ret void
}

define void @test_masked_ldst_sv8f16(half* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_half to <vscale x 8 x half>*
  %data = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %mask, <vscale x 8 x half>* %base_addr)
  call void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half> %data, <vscale x 8 x i1> %mask, <vscale x 8 x half>* %base_addr)
  ret void
}

; 16-lane non-temporal load/stores.

define void @test_masked_ldst_sv16i8(i8* %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
  %data = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8>* %base_addr)
  call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, <vscale x 16 x i8>* %base_addr)
  ret void
}

; 2-element non-temporal loads.
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)

; 4-element non-temporal loads.
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)

; 8-element non-temporal loads.
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)

; 16-element non-temporal loads.
declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)

; 2-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)

; 4-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)

; 8-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)

; 16-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)