
[SVE][CodeGen] Fix DAGCombiner::ForwardStoreValueToDirectLoad for scalable vectors

In DAGCombiner::ForwardStoreValueToDirectLoad I have fixed up some
implicit casts from TypeSize -> uint64_t and replaced calls to
getVectorNumElements() with getVectorElementCount(). There are some
simple cases of forwarding that we can definitely support for
scalable vectors, i.e. when the store and load are both scalable
vectors and have the same size. I have added tests for the new
code paths here:

  CodeGen/AArch64/sve-forward-st-to-ld.ll

Differential Revision: https://reviews.llvm.org/D87098
This commit is contained in:
David Sherwood 2020-09-03 08:28:57 +01:00
parent 88c849a4b2
commit 4bc4e2e524
2 changed files with 135 additions and 10 deletions
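As context for the diff below, here is a small self-contained C++ sketch of the coverage check the patch introduces. The types and names (ToySize, storeCoversLoad) are invented for illustration and are not LLVM's TypeSize/ElementCount API; the point is simply that a comparison between a fixed and a scalable size cannot be resolved at compile time, so the scalable case only forwards when the offset is zero and the minimum sizes match.

#include <cassert>
#include <cstdint>

// Toy stand-in for a vector size in bits: either a fixed constant, or
// MinBits * vscale where vscale is only known at run time.
struct ToySize {
  uint64_t MinBits;
  bool Scalable;
};

// Can a store of size St at byte 0 be proven to cover a load of size Ld at
// byte Offset for *every* possible vscale? This conservatively mirrors the
// STCoversLD logic added by the patch.
static bool storeCoversLoad(ToySize St, ToySize Ld, int64_t Offset) {
  if (St.Scalable != Ld.Scalable)
    return false; // mixed fixed/scalable: the answer depends on vscale
  if (St.Scalable)
    // The patch only forwards when the offset is zero and the minimum
    // (vscale = 1) sizes are identical, which is trivially safe.
    return Offset == 0 && St.MinBits == Ld.MinBits;
  // Fixed-width sizes: ordinary compile-time arithmetic still applies.
  return Offset >= 0 && Offset * 8 + Ld.MinBits <= St.MinBits;
}

int main() {
  assert(storeCoversLoad({128, true}, {128, true}, 0));   // nxv2i64 store, nxv2i64 load
  assert(!storeCoversLoad({128, true}, {128, false}, 0)); // scalable store, fixed load
  assert(storeCoversLoad({128, false}, {64, false}, 8));  // v2i64 store, i64 load of the tail
  return 0;
}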

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -14792,8 +14792,8 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
-static inline int numVectorEltsOrZero(EVT T) {
-  return T.isVector() ? T.getVectorNumElements() : 0;
+static inline ElementCount numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
 }
 
 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
@@ -14861,6 +14861,24 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   EVT STMemType = ST->getMemoryVT();
   EVT STType = ST->getValue().getValueType();
 
+  // There are two cases to consider here:
+  //  1. The store is fixed width and the load is scalable. In this case we
+  //     don't know at compile time if the store completely envelops the load
+  //     so we abandon the optimisation.
+  //  2. The store is scalable and the load is fixed width. We could
+  //     potentially support a limited number of cases here, but there has been
+  //     no cost-benefit analysis to prove it's worth it.
+  bool LdStScalable = LDMemType.isScalableVector();
+  if (LdStScalable != STMemType.isScalableVector())
+    return SDValue();
+
+  // If we are dealing with scalable vectors on a big endian platform the
+  // calculation of offsets below becomes trickier, since we do not know at
+  // compile time the absolute size of the vector. Until we've done more
+  // analysis on big-endian platforms it seems better to bail out for now.
+  if (LdStScalable && DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
@@ -14872,13 +14890,21 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // the stored value). With Offset=n (for n > 0) the loaded value starts at the
   // n:th least significant byte of the stored value.
   if (DAG.getDataLayout().isBigEndian())
-    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
-              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
+    Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
+              (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
+                 8 -
+             Offset;
 
   // Check that the stored value cover all bits that are loaded.
-  bool STCoversLD =
-      (Offset >= 0) &&
-      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+  bool STCoversLD;
+
+  TypeSize LdMemSize = LDMemType.getSizeInBits();
+  TypeSize StMemSize = STMemType.getSizeInBits();
+  if (LdStScalable)
+    STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
+  else
+    STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
+                                   StMemSize.getFixedSize());
 
   auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
     if (LD->isIndexed()) {
@@ -14899,15 +14925,15 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
-    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+    if (LDType.getSizeInBits() == LdMemSize)
       return ReplaceLd(LD, ST->getValue(), Chain);
     // Can we model the truncate and extension with an and mask?
     if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
         !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
       // Mask to size of LDMemType
       auto Mask =
-          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
-                                               STMemType.getSizeInBits()),
+          DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
+                                               StMemSize.getFixedSize()),
                           SDLoc(ST), STType);
       auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
       return ReplaceLd(LD, Val, Chain);
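The big-endian bail-out above exists because the offset rewrite needs the difference of the stored and loaded memory sizes as a compile-time constant. Below is a minimal illustrative sketch with plain integers (bigEndianOffset is an invented helper, not a DAGCombiner function) of that byte-offset flip for fixed-width types.

#include <cassert>
#include <cstdint>

// On a big-endian target the combiner re-expresses a memory byte offset as
// "offset from the least significant byte of the stored value", which needs
// the size difference between the stored and loaded values as a constant.
static int64_t bigEndianOffset(uint64_t StoreBits, uint64_t LoadBits,
                               int64_t Offset) {
  return (int64_t)(StoreBits - LoadBits) / 8 - Offset;
}

int main() {
  // Store an i64, then load the i32 at memory offset 0: on big-endian that is
  // the high half of the value, i.e. it starts at the 4th least significant byte.
  assert(bigEndianOffset(64, 32, 0) == 4);
  // Loading at memory offset 4 yields the low half of the value.
  assert(bigEndianOffset(64, 32, 4) == 0);
  return 0;
}

With scalable vectors both sizes would be multiples of an unknown vscale, so the rewritten offset has no fixed value, which is why the combine is abandoned for scalable vectors on big-endian targets.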

test/CodeGen/AArch64/sve-forward-st-to-ld.ll

@@ -0,0 +1,99 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
define <vscale x 2 x i64> @sti64ldi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti64ldi64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
%0 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
ret <vscale x 2 x i64> %0
}
define <vscale x 2 x double> @stf64ldf64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
; CHECK-LABEL: stf64ldf64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
%0 = load <vscale x 2 x double>, <vscale x 2 x double>* %arrayidx1
ret <vscale x 2 x double> %0
}
define <vscale x 2 x i64> @sti32ldi32ext(<vscale x 2 x i32>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti32ldi32ext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z1.d, p0/m, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x0]
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
%0 = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
store <vscale x 2 x i32> %0, <vscale x 2 x i32>* %P
%1 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %P
%2 = sext <vscale x 2 x i32> %1 to <vscale x 2 x i64>
ret <vscale x 2 x i64> %2
}
define <2 x i64> @sti64ldfixedi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti64ldfixedi64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: ret
entry:
%arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
%arrayidx1 = bitcast <vscale x 2 x i64>* %arrayidx0 to <2 x i64>*
%0 = load <2 x i64>, <2 x i64>* %arrayidx1
ret <2 x i64> %0
}
define <vscale x 4 x i32> @sti64ldi32(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti64ldi32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%0 = bitcast <vscale x 2 x i64>* %P to <vscale x 4 x i32>*
%arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* %0, i64 1
%1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %arrayidx1
ret <vscale x 4 x i32> %1
}
define <vscale x 2 x i64> @stf64ldi64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
; CHECK-LABEL: stf64ldi64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%0 = bitcast <vscale x 2 x double>* %P to <vscale x 2 x i64>*
%arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %0, i64 1
%1 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
ret <vscale x 2 x i64> %1
}