From 4bc4e2e52495c23e3a88f83811e8f6a41404a535 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Thu, 3 Sep 2020 08:28:57 +0100
Subject: [PATCH] [SVE][CodeGen] Fix
 DAGCombiner::ForwardStoreValueToDirectLoad for scalable vectors

In DAGCombiner::ForwardStoreValueToDirectLoad I have fixed up some
implicit casts from TypeSize -> uint64_t and replaced calls to
getVectorNumElements() with getVectorElementCount().

There are some simple cases of forwarding that we can definitely
support for scalable vectors, i.e. when the store and load are both
scalable vectors and have the same size.
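
For context (not part of the patch itself), here is a minimal standalone
sketch of the covering check that the new STCoversLD logic below performs,
assuming only the TypeSize API from llvm/Support/TypeSize.h; the helper
name and its parameters are illustrative, not code from the patch:

  #include "llvm/Support/TypeSize.h"
  #include <cstdint>

  using namespace llvm;

  // Does a store of StMemSize bits cover every bit of a load of LdMemSize
  // bits that starts Offset bytes into the stored value?
  static bool storeCoversLoad(int64_t Offset, TypeSize LdMemSize,
                              TypeSize StMemSize) {
    // Mixed fixed/scalable sizes cannot be compared at compile time.
    if (LdMemSize.isScalable() != StMemSize.isScalable())
      return false;
    if (LdMemSize.isScalable())
      // For scalable types only the exact-overlap case is known to be safe.
      return Offset == 0 && LdMemSize == StMemSize;
    // Fixed-width sizes can be compared as plain integers.
    return Offset >= 0 && (uint64_t)Offset * 8 + LdMemSize.getFixedSize() <=
                              StMemSize.getFixedSize();
  }
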
I have added tests for the new code paths here:

  CodeGen/AArch64/sve-forward-st-to-ld.ll

Differential Revision: https://reviews.llvm.org/D87098
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp     | 46 +++++++--
 test/CodeGen/AArch64/sve-forward-st-to-ld.ll | 99 ++++++++++++++++++++
 2 files changed, 135 insertions(+), 10 deletions(-)
 create mode 100644 test/CodeGen/AArch64/sve-forward-st-to-ld.ll

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f67a06f81c5..c36a085599b 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14792,8 +14792,8 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
-static inline int numVectorEltsOrZero(EVT T) {
-  return T.isVector() ? T.getVectorNumElements() : 0;
+static inline ElementCount numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
 }
 
 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
@@ -14861,6 +14861,24 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   EVT STMemType = ST->getMemoryVT();
   EVT STType = ST->getValue().getValueType();
 
+  // There are two cases to consider here:
+  //  1. The store is fixed width and the load is scalable. In this case we
+  //     don't know at compile time if the store completely envelops the load
+  //     so we abandon the optimisation.
+  //  2. The store is scalable and the load is fixed width. We could
+  //     potentially support a limited number of cases here, but there has been
+  //     no cost-benefit analysis to prove it's worth it.
+  bool LdStScalable = LDMemType.isScalableVector();
+  if (LdStScalable != STMemType.isScalableVector())
+    return SDValue();
+
+  // If we are dealing with scalable vectors on a big endian platform the
+  // calculation of offsets below becomes trickier, since we do not know at
+  // compile time the absolute size of the vector. Until we've done more
+  // analysis on big-endian platforms it seems better to bail out for now.
+  if (LdStScalable && DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
@@ -14872,13 +14890,21 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // the stored value). With Offset=n (for n > 0) the loaded value starts at the
   // n:th least significant byte of the stored value.
   if (DAG.getDataLayout().isBigEndian())
-    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
-              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
+    Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
+              (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
+                 8 -
+             Offset;
 
   // Check that the stored value cover all bits that are loaded.
-  bool STCoversLD =
-      (Offset >= 0) &&
-      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+  bool STCoversLD;
+
+  TypeSize LdMemSize = LDMemType.getSizeInBits();
+  TypeSize StMemSize = STMemType.getSizeInBits();
+  if (LdStScalable)
+    STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
+  else
+    STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
+                                       StMemSize.getFixedSize());
 
   auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
     if (LD->isIndexed()) {
@@ -14899,15 +14925,15 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
-    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+    if (LDType.getSizeInBits() == LdMemSize)
       return ReplaceLd(LD, ST->getValue(), Chain);
     // Can we model the truncate and extension with an and mask?
     if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
         !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
       // Mask to size of LDMemType
       auto Mask =
-          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
-                                               STMemType.getSizeInBits()),
+          DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
+                                               StMemSize.getFixedSize()),
                           SDLoc(ST), STType);
       auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
       return ReplaceLd(LD, Val, Chain);
diff --git a/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
new file mode 100644
index 00000000000..b38f7da73ea
--- /dev/null
+++ b/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define <vscale x 2 x i64> @sti64ldi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  %0 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x double> @stf64ldf64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldf64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  %0 = load <vscale x 2 x double>, <vscale x 2 x double>* %arrayidx1
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x i64> @sti32ldi32ext(<vscale x 2 x i32>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti32ldi32ext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z1.d, p0/m, z0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %0 = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
+  store <vscale x 2 x i32> %0, <vscale x 2 x i32>* %P
+  %1 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %P
+  %2 = sext <vscale x 2 x i32> %1 to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %2
+}
+
+define <2 x i64> @sti64ldfixedi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldfixedi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = bitcast <vscale x 2 x i64>* %arrayidx0 to <2 x i64>*
+  %0 = load <2 x i64>, <2 x i64>* %arrayidx1
+  ret <2 x i64> %0
+}
+
+define <vscale x 4 x i32> @sti64ldi32(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x i64>* %P to <vscale x 4 x i32>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* %0, i64 1
+  %1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %arrayidx1
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 2 x i64> @stf64ldi64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x double>* %P to <vscale x 2 x i64>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %0, i64 1
+  %1 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %1
+}