
[SVE][CodeGen] Legalisation of masked loads and stores

Summary:
This patch modifies IncrementMemoryAddress to scale the increment by vscale
when calculating the new address if the data type is a scalable vector.
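
A minimal sketch of the address arithmetic this enables (not code from the
patch; the helper names below are hypothetical): for a scalable type the
pointer advances by the known-minimum store size multiplied by the runtime
vscale, whereas a fixed-width type advances by a plain constant.

// Hypothetical illustration only, assuming the runtime vscale value is known.
#include <cstdint>

// Step to the next part of a scalable vector, e.g. <vscale x 8 x i16> has a
// known-minimum store size of 16 bytes, so the step is VScale * 16 bytes.
uint64_t nextScalablePartAddr(uint64_t Addr, uint64_t KnownMinStoreBytes,
                              uint64_t VScale) {
  return Addr + VScale * KnownMinStoreBytes;
}

// Step to the next part of a fixed-width vector: just its store size in bytes.
uint64_t nextFixedPartAddr(uint64_t Addr, uint64_t StoreBytes) {
  return Addr + StoreBytes;
}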

It also adds TableGen patterns that match an extract_subvector
of a legal predicate type to zip1/zip2 instructions.
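
The patterns rely on zip1/zip2 interleaving a predicate with an all-false
predicate, which re-expresses each half of the mask at twice the element
width. Below is a minimal fixed-width model of that behaviour (not part of
the patch; the 16-lane array merely stands in for a scalable predicate
register).

#include <array>
#include <cstddef>

using Pred = std::array<bool, 16>; // stand-in for a scalable predicate

// zip1 interleaves the low halves of A and B element by element.
Pred zip1(const Pred &A, const Pred &B) {
  Pred R{};
  for (std::size_t i = 0; i < 8; ++i) {
    R[2 * i] = A[i];
    R[2 * i + 1] = B[i];
  }
  return R;
}

// zip2 does the same for the high halves.
Pred zip2(const Pred &A, const Pred &B) {
  Pred R{};
  for (std::size_t i = 0; i < 8; ++i) {
    R[2 * i] = A[8 + i];
    R[2 * i + 1] = B[8 + i];
  }
  return R;
}

// With B all-false, zip1(Ps, B) keeps one active bit per widened element and
// so acts as "extract the low half of Ps at twice the element width", while
// zip2(Ps, B) extracts the high half, which is the effect the new patterns
// use for extract_subvector of legal predicate types.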

Reviewers: sdesmalen, efriedma, david-arm

Reviewed By: efriedma, david-arm

Subscribers: tschuett, hiraditya, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D83137
Author: Kerry McLaughlin
Date:   2020-07-16 10:12:41 +01:00
Commit: 30eb603e95 (parent: aa7315a8f1)

4 changed files with 184 additions and 3 deletions


@@ -7153,6 +7153,9 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
  assert(DataVT.getVectorNumElements() == MaskVT.getVectorNumElements() &&
         "Incompatible types of Data and Mask");
  if (IsCompressedMemory) {
    if (DataVT.isScalableVector())
      report_fatal_error(
          "Cannot currently handle compressed memory with scalable vectors");
    // Incrementing the pointer according to number of '1's in the mask.
    EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
    SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
@@ -7168,6 +7171,10 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
    SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
                                    AddrVT);
    Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale);
  } else if (DataVT.isScalableVector()) {
    Increment = DAG.getVScale(DL, AddrVT,
                              APInt(AddrVT.getSizeInBits().getFixedSize(),
                                    DataVT.getStoreSize().getKnownMinSize()));
  } else
    Increment = DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT);


@@ -1109,6 +1109,20 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;

// Extract lo/hi halves of legal predicate types.
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
          (ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
          (ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
          (ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
          (ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
          (ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
          (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;

defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;


@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; LOAD
; UNPREDICATED
define <vscale x 4 x i16> @load_promote_4i8(<vscale x 4 x i16>* %a) {
; CHECK-LABEL: load_promote_4i8:
define <vscale x 4 x i16> @load_promote_4i16(<vscale x 4 x i16>* %a) {
; CHECK-LABEL: load_promote_4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
@@ -53,3 +53,82 @@ define <vscale x 16 x i64> @load_split_16i64(<vscale x 16 x i64>* %a) {
%load = load <vscale x 16 x i64>, <vscale x 16 x i64>* %a
ret <vscale x 16 x i64> %load
}

; MASKED

define <vscale x 2 x i32> @masked_load_promote_2i32(<vscale x 2 x i32> *%a, <vscale x 2 x i1> %pg) {
; CHECK-LABEL: masked_load_promote_2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32> *%a, i32 1, <vscale x 2 x i1> %pg, <vscale x 2 x i32> undef)
ret <vscale x 2 x i32> %load
}
define <vscale x 32 x i8> @masked_load_split_32i8(<vscale x 32 x i8> *%a, <vscale x 32 x i1> %pg) {
; CHECK-LABEL: masked_load_split_32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
%load = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8> *%a, i32 1, <vscale x 32 x i1> %pg, <vscale x 32 x i8> undef)
ret <vscale x 32 x i8> %load
}
define <vscale x 32 x i16> @masked_load_split_32i16(<vscale x 32 x i16> *%a, <vscale x 32 x i1> %pg) {
; CHECK-LABEL: masked_load_split_32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p2.b
; CHECK-NEXT: zip1 p3.b, p0.b, p2.b
; CHECK-NEXT: zip2 p0.b, p0.b, p2.b
; CHECK-NEXT: ld1h { z0.h }, p3/z, [x0]
; CHECK-NEXT: zip1 p3.b, p1.b, p2.b
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: zip2 p0.b, p1.b, p2.b
; CHECK-NEXT: ld1h { z2.h }, p3/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: ret
%load = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16(<vscale x 32 x i16> *%a, i32 1, <vscale x 32 x i1> %pg, <vscale x 32 x i16> undef)
ret <vscale x 32 x i16> %load
}
define <vscale x 8 x i32> @masked_load_split_8i32(<vscale x 8 x i32> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_load_split_8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
%load = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32> *%a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i32> undef)
ret <vscale x 8 x i32> %load
}
define <vscale x 8 x i64> @masked_load_split_8i64(<vscale x 8 x i64> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_load_split_8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
; CHECK-NEXT: zip1 p3.s, p2.s, p1.s
; CHECK-NEXT: zip2 p2.s, p2.s, p1.s
; CHECK-NEXT: ld1d { z0.d }, p3/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p2/z, [x0, #1, mul vl]
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: ret
%load = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i64> undef)
ret <vscale x 8 x i64> %load
}
declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8>*, i32, <vscale x 32 x i1>, <vscale x 32 x i8>)
declare <vscale x 32 x i16> @llvm.masked.load.nxv32i16(<vscale x 32 x i16>*, i32, <vscale x 32 x i1>, <vscale x 32 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32>*, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
declare <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64>*, i32, <vscale x 8 x i1>, <vscale x 8 x i64>)


@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; UNPREDICATED
define void @store_promote_4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8>* %a) {
; CHECK-LABEL: store_promote_4i8:
; CHECK: // %bb.0:
@@ -51,3 +53,82 @@ define void @store_split_16i64(<vscale x 16 x i64> %data, <vscale x 16 x i64>* %
store <vscale x 16 x i64> %data, <vscale x 16 x i64>* %a
ret void
}

; MASKED

define void @masked_store_promote_2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8> *%a, <vscale x 2 x i1> %pg) {
; CHECK-LABEL: masked_store_promote_2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: st1b { z0.d }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8> *%a, i32 1, <vscale x 2 x i1> %pg)
ret void
}
define void @masked_store_split_32i8(<vscale x 32 x i8> %data, <vscale x 32 x i8> *%a, <vscale x 32 x i1> %pg) {
; CHECK-LABEL: masked_store_split_32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: st1b { z1.b }, p1, [x0, #1, mul vl]
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv32i8(<vscale x 32 x i8> %data, <vscale x 32 x i8> *%a, i32 1, <vscale x 32 x i1> %pg)
ret void
}
define void @masked_store_split_32i16(<vscale x 32 x i16> %data, <vscale x 32 x i16> *%a, <vscale x 32 x i1> %pg) {
; CHECK-LABEL: masked_store_split_32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p2.b
; CHECK-NEXT: zip2 p3.b, p1.b, p2.b
; CHECK-NEXT: zip1 p1.b, p1.b, p2.b
; CHECK-NEXT: st1h { z3.h }, p3, [x0, #3, mul vl]
; CHECK-NEXT: zip2 p3.b, p0.b, p2.b
; CHECK-NEXT: zip1 p0.b, p0.b, p2.b
; CHECK-NEXT: st1h { z2.h }, p1, [x0, #2, mul vl]
; CHECK-NEXT: st1h { z1.h }, p3, [x0, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv32i16(<vscale x 32 x i16> %data, <vscale x 32 x i16> *%a, i32 1, <vscale x 32 x i1> %pg)
ret void
}
define void @masked_store_split_8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_store_split_8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
; CHECK-NEXT: st1w { z1.s }, p2, [x0, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32> *%a, i32 1, <vscale x 8 x i1> %pg)
ret void
}
define void @masked_store_split_8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64> *%a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: masked_store_split_8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
; CHECK-NEXT: zip2 p3.s, p2.s, p1.s
; CHECK-NEXT: zip1 p2.s, p2.s, p1.s
; CHECK-NEXT: st1d { z2.d }, p2, [x0, #2, mul vl]
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: st1d { z3.d }, p3, [x0, #3, mul vl]
; CHECK-NEXT: st1d { z1.d }, p2, [x0, #1, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg)
ret void
}
declare void @llvm.masked.store.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>*, i32, <vscale x 32 x i1>)
declare void @llvm.masked.store.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>*, i32, <vscale x 32 x i1>)
declare void @llvm.masked.store.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>*, i32, <vscale x 8 x i1>)