[AArch64][SVE] Add bfloat16 support to store intrinsics
Summary:
Bfloat16 support added for the following intrinsics:
- ST1
- STNT1

Reviewers: sdesmalen, c-rhodes, fpetrogalli, efriedma, stuij, david-arm

Reviewed By: fpetrogalli

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D82448
parent 2a4dbacbc5
commit eb10451902
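The following is a minimal IR sketch of what this patch enables, distilled from the tests added below. The function names and the RUN line are illustrative (assumptions, not part of this commit); the intrinsic signatures, check patterns, and attributes are taken verbatim from the new tests. With +bf16 in the target features, a predicated nxv8bf16 store selects st1h and the non-temporal form selects stnt1h:

; Illustrative RUN line (assumed): llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

define void @st1h_bf16_example(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %addr) #0 {
; CHECK-LABEL: st1h_bf16_example:
; CHECK: st1h { z0.h }, p0, [x0]
  call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %addr)
  ret void
}

define void @stnt1h_bf16_example(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %addr) #0 {
; CHECK-LABEL: stnt1h_bf16_example:
; CHECK: stnt1h { z0.h }, p0, [x0]
  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %addr)
  ret void
}

declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }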
@@ -12037,6 +12037,7 @@ static MVT getSVEContainerType(EVT ContentTy) {
  case MVT::nxv8i8:
  case MVT::nxv8i16:
  case MVT::nxv8f16:
  case MVT::nxv8bf16:
    return MVT::nxv8i16;
  case MVT::nxv16i8:
    return MVT::nxv16i8;
@@ -12127,6 +12128,11 @@ static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
  EVT HwSrcVt = getSVEContainerType(DataVT);
  SDValue InputVT = DAG.getValueType(DataVT);

  if (DataVT == MVT::nxv8bf16)
    assert(
        static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16() &&
        "Unsupported type (BF16)");

  if (DataVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

@@ -12153,6 +12159,11 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
  EVT DataVT = Data.getValueType();
  EVT PtrTy = N->getOperand(4).getValueType();

  if (DataVT == MVT::nxv8bf16)
    assert(
        static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16() &&
        "Unsupported type (BF16)");

  if (DataVT.isFloatingPoint())
    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
@@ -1444,6 +1444,7 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
  def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
  def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
  def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
  def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
  def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
  def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;

@@ -1595,6 +1596,10 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
  defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
  defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;

  let Predicates = [HasBF16, HasSVE] in {
    defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
  }

  // 16-element contiguous stores
  defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
@@ -126,6 +126,17 @@ define void @st1h_f16_inbound(<vscale x 8 x half> %data, <vscale x 8 x i1> %pg,
  ret void
}

define void @st1h_bf16_inbound(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %a) #0 {
; CHECK-LABEL: st1h_bf16_inbound:
; CHECK: st1h { z0.h }, p0, [x0, #-5, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 -5
  %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
  call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %base_scalar)
  ret void
}

define void @st1h_s_inbound(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %a) {
; CHECK-LABEL: st1h_s_inbound:
; CHECK: st1h { z0.s }, p0, [x0, #2, mul vl]
@@ -219,6 +230,7 @@ declare void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1
declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*)
declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)

declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*)
declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*)
@@ -230,3 +242,6 @@ declare void @llvm.aarch64.sve.st1.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>
declare void @llvm.aarch64.sve.st1.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*)
declare void @llvm.aarch64.sve.st1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.st1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -82,6 +82,17 @@ define void @st1h_f16(<vscale x 8 x half> %data, <vscale x 8 x i1> %pred, half*
  ret void
}

define void @st1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %a, i64 %index) #0 {
; CHECK-LABEL: st1h_bf16:
; CHECK: st1h { z0.h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base = getelementptr bfloat, bfloat* %a, i64 %index
  call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data,
                                           <vscale x 8 x i1> %pred,
                                           bfloat* %base)
  ret void
}

define void @st1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pred, i16* %addr) {
; CHECK-LABEL: st1h_s:
; CHECK: st1h { z0.s }, p0, [x0]
@@ -174,6 +185,7 @@ declare void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1
declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*)
declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)

declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*)
declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*)
@@ -185,3 +197,6 @@ declare void @llvm.aarch64.sve.st1.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>
declare void @llvm.aarch64.sve.st1.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*)
declare void @llvm.aarch64.sve.st1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.st1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -75,6 +75,16 @@ define void @st1h_f16(<vscale x 8 x half> %data, <vscale x 8 x i1> %pred, half*
  ret void
}

define void @st1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %addr) #0 {
; CHECK-LABEL: st1h_bf16:
; CHECK: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data,
                                           <vscale x 8 x i1> %pred,
                                           bfloat* %addr)
  ret void
}

define void @st1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pred, i16* %addr) {
; CHECK-LABEL: st1h_s:
; CHECK: st1h { z0.s }, p0, [x0]
@@ -161,6 +171,7 @@ declare void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1
declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*)
declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)

declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*)
declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*)
@@ -172,3 +183,6 @@ declare void @llvm.aarch64.sve.st1.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>
declare void @llvm.aarch64.sve.st1.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*)
declare void @llvm.aarch64.sve.st1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.st1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -44,7 +44,7 @@ define void @st2h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
  ret void
}

-define void @st2h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, bfloat* %addr) {
+define void @st2h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, bfloat* %addr) #0 {
; CHECK-LABEL: st2h_bf16:
; CHECK: st2h { z0.h, z1.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -151,7 +151,7 @@ define void @st3h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
  ret void
}

-define void @st3h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, bfloat* %addr) {
+define void @st3h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, bfloat* %addr) #0 {
; CHECK-LABEL: st3h_bf16:
; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -266,7 +266,7 @@ define void @st4h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
  ret void
}

-define void @st4h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, bfloat* %addr) {
+define void @st4h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, bfloat* %addr) #0 {
; CHECK-LABEL: st4h_bf16:
; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -377,6 +377,16 @@ define void @stnt1h_f16(<vscale x 8 x half> %data, <vscale x 8 x i1> %pred, half
  ret void
}

define void @stnt1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %addr) #0 {
; CHECK-LABEL: stnt1h_bf16:
; CHECK: stnt1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data,
                                             <vscale x 8 x i1> %pred,
                                             bfloat* %addr)
  ret void
}

;
; STNT1W
;
@@ -458,5 +468,9 @@ declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i
declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*)
declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -179,6 +179,14 @@ define void @masked_store_nxv8f16(<vscale x 8 x half> *%a, <vscale x 8 x half> %
  ret void
}

define void @masked_store_nxv8bf16(<vscale x 8 x bfloat> *%a, <vscale x 8 x bfloat> %val, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: masked_store_nxv8bf16:
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %val, <vscale x 8 x bfloat> *%a, i32 2, <vscale x 8 x i1> %mask)
  ret void
}

declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
@@ -203,6 +211,7 @@ declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -513,6 +513,24 @@ define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x
  ret void
}

define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_load,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        <vscale x 8 x bfloat>* %base_store,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
@@ -596,6 +614,7 @@ declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i3
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
@@ -620,6 +639,10 @@ declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x floa
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -498,6 +498,24 @@ define void @test_masked_ldst_sv8f16(half * %base, <vscale x 8 x i1> %mask, i64
  ret void
}

define void @test_masked_ldst_sv8bf16(bfloat * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, bfloat* %base, i64 %offset
  %base_addr = bitcast bfloat* %base_f16 to <vscale x 8 x bfloat>*
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_addr,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        <vscale x 8 x bfloat>* %base_addr,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
@@ -584,6 +602,7 @@ declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i3
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
@@ -608,6 +627,10 @@ declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x floa
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -139,6 +139,23 @@ define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x
  ret void
}

define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1
  %base_load_bc = bitcast <vscale x 8 x bfloat>* %base_load to bfloat*
  %data = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %mask,
                                                                      bfloat* %base_load_bc)
  %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2
  %base_store_bc = bitcast <vscale x 8 x bfloat>* %base_store to bfloat*
  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data,
                                             <vscale x 8 x i1> %mask,
                                             bfloat* %base_store_bc)
  ret void
}

; 16-lane non-temporal load/stores.

define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
@@ -169,6 +186,7 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>,
; 8-element non-temporal loads.
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)

; 16-element non-temporal loads.
declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)
@@ -176,15 +194,18 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i
; 2-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)

; 4-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*)

; 8-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)

; 16-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }
@@ -94,6 +94,20 @@ define void @test_masked_ldst_sv8f16(half* %base, <vscale x 8 x i1> %mask, i64 %
  ret void
}

define void @test_masked_ldst_sv8bf16(bfloat* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %gep = getelementptr bfloat, bfloat* %base, i64 %offset
  %data = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %mask,
                                                                      bfloat* %gep)
  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data,
                                             <vscale x 8 x i1> %mask,
                                             bfloat* %gep)
  ret void
}

; 16-lane non-temporal load/stores.

define void @test_masked_ldst_sv16i8(i8* %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
@@ -121,6 +135,7 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>,
; 8-element non-temporal loads.
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)

; 16-element non-temporal loads.
declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)
@@ -128,14 +143,18 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i
; 2-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)

; 4-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*)

; 8-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)

; 16-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }