1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 04:32:44 +01:00

[NVPTX] Avoid temp copy of byval kernel parameters.

Avoid making a temporary copy of a byval argument if all accesses are loads and
therefore the pointer to the parameter cannot escape.

This avoids excessive global memory accesses when each kernel makes its own
copy.

Differential revision: https://reviews.llvm.org/D98469
This commit is contained in:
Artem Belevich 2021-03-04 18:34:39 -08:00
parent 40de5cf96a
commit 7a17da6eb6
4 changed files with 167 additions and 2 deletions

View File

@ -140,6 +140,7 @@ INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
// =============================================================================
// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
// and we can't guarantee that the only accesses are loads,
// then add the following instructions to the first basic block:
//
// %temp = alloca %struct.x, align 8
@ -150,7 +151,57 @@ INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
// The above code allocates some space in the stack and copies the incoming
// struct from param space to local space.
// Then replace all occurrences of %d by %temp.
//
// In case we know that all users are GEPs or Loads, replace them with the same
// ones in parameter AS, so we can access them using ld.param.
// =============================================================================
// Replaces the \p OldUser instruction with the same in parameter AS.
// Only Load and GEP are supported.
// Replaces the \p OldUser instruction with the same one rewritten to operate
// on \p Param, a pointer in the parameter address space. Only Load and GEP
// users are supported; anything else must have been rejected by isALoadChain.
static void convertToParamAS(Value *OldUser, Value *Param) {
  Instruction *I = dyn_cast<Instruction>(OldUser);
  assert(I && "OldUser must be an instruction");
  // Work item: an instruction to convert, paired with the param-AS value that
  // should replace its pointer operand.
  struct IP {
    Instruction *OldInstruction;
    Value *NewParam;
  };
  SmallVector<IP> ItemsToConvert = {{I, Param}};
  SmallVector<GetElementPtrInst *> GEPsToDelete;
  while (!ItemsToConvert.empty()) {
    // Note: use a distinct name here; the original code shadowed the outer
    // 'Instruction *I' with this worklist item.
    IP Item = ItemsToConvert.pop_back_val();
    if (auto *LI = dyn_cast<LoadInst>(Item.OldInstruction)) {
      // Loads are converted in place by retargeting their pointer operand.
      LI->setOperand(0, Item.NewParam);
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Item.OldInstruction)) {
      // Clone the GEP on top of the param-AS pointer. Pass the source element
      // type explicitly rather than nullptr so it does not have to be
      // re-derived from the pointer operand's type.
      SmallVector<Value *, 4> Indices(GEP->indices());
      auto *NewGEP =
          GetElementPtrInst::Create(GEP->getSourceElementType(), Item.NewParam,
                                    Indices, GEP->getName(), GEP);
      NewGEP->setIsInBounds(GEP->isInBounds());
      // Queue the old GEP's users for conversion against the new GEP.
      llvm::for_each(GEP->users(), [NewGEP, &ItemsToConvert](Value *V) {
        ItemsToConvert.push_back({cast<Instruction>(V), NewGEP});
      });
      // The old GEP still has unconverted users at this point; defer erasing
      // it until the worklist has been fully drained.
      GEPsToDelete.push_back(GEP);
    } else {
      llvm_unreachable("Only Load and GEP can be converted to param AS.");
    }
  }
  llvm::for_each(GEPsToDelete,
                 [](GetElementPtrInst *GEP) { GEP->eraseFromParent(); });
}
// Returns true if \p Start and all of its transitive users are only loads and
// GEPs, i.e. the pointer is never stored, passed to a call, or otherwise
// escaped, so every access can be rewritten to go through the param AS.
static bool isALoadChain(Value *Start) {
  SmallVector<Value *, 16> ValuesToCheck = {Start};
  while (!ValuesToCheck.empty()) {
    Value *V = ValuesToCheck.pop_back_val();
    Instruction *I = dyn_cast<Instruction>(V);
    // A non-instruction user (e.g. a constant expression) is treated as an
    // escape.
    if (!I)
      return false;
    if (isa<GetElementPtrInst>(I))
      // GEPs are fine by themselves, but their users must be checked too.
      ValuesToCheck.append(I->user_begin(), I->user_end());
    else if (!isa<LoadInst>(I))
      return false;
  }
  return true;
} // NB: dropped the stray ';' that followed this function definition.
void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
Function *Func = Arg->getParent();
Instruction *FirstInst = &(Func->getEntryBlock().front());
@ -159,6 +210,21 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
assert(PType && "Expecting pointer type in handleByValParam");
Type *StructType = PType->getElementType();
if (llvm::all_of(Arg->users(), isALoadChain)) {
// Replace all loads with the loads in param AS. This allows loading the Arg
// directly from parameter AS, without making a temporary copy.
SmallVector<User *, 16> UsersToUpdate(Arg->users());
Value *ArgInParamAS = new AddrSpaceCastInst(
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
convertToParamAS(V, ArgInParamAS);
});
return;
}
// Otherwise we have to create a temporary copy.
const DataLayout &DL = Func->getParent()->getDataLayout();
unsigned AS = DL.getAllocaAddrSpace();
AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);

View File

@ -23,5 +23,12 @@ entry:
%arg.idx.val.val = load i32, i32* %arg.idx.val, align 4
%add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
store i32 %add.i, i32* %arg.idx1.val, align 4
; Let the pointer escape so we still create a local copy; this test uses the
; copy to check the load alignment.
%tmp = call i32* @escape(i32* nonnull %arg.idx2)
ret void
}
; Function Attrs: convergent nounwind
declare dso_local i32* @escape(i32*) local_unnamed_addr

View File

@ -0,0 +1,92 @@
; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
target triple = "nvptx64-nvidia-cuda"
%struct.ham = type { [4 x i32] }
; // Verify that load with static offset into parameter is done directly.
; CHECK-LABEL: .visible .entry static_offset
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
; Reads a field at a constant offset (element 3 of [4 x i32] => +12 bytes)
; from the byval struct param and stores it to %arg, but only when %arg2 == 3.
; All param accesses are loads, so the CHECK lines above expect a direct
; ld.param.u32 with a +12 immediate, with no local temp copy.
define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
%tmp = icmp eq i32 %arg2, 3
br i1 %tmp, label %bb3, label %bb6
bb3: ; preds = %bb
; Constant-offset GEP into the byval parameter.
%tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
%tmp5 = load i32, i32* %tmp4, align 4
store i32 %tmp5, i32* %arg, align 4
br label %bb6
bb6: ; preds = %bb3, %bb
ret void
}
; // Verify that load with dynamic offset into parameter is also done directly.
; CHECK-LABEL: .visible .entry dynamic_offset
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
; Reads a field at a runtime-computed offset (%arg2) from the byval struct
; param and stores it to %arg. All param accesses are still loads, so the
; CHECK lines above expect an add.s64 on the param address followed by a
; direct ld.param.u32 — again no local temp copy.
define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
%tmp = sext i32 %arg2 to i64
; Variable-index GEP into the byval parameter.
%tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
%tmp4 = load i32, i32* %tmp3, align 4
store i32 %tmp4, i32* %arg, align 4
ret void
}
; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
; CHECK-LABEL: .visible .entry pointer_escapes
; CHECK: .local .align 8 .b8 __local_depot{{.*}}
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: convergent norecurse nounwind mustprogress
; The GEP result is passed to @escape, so the param pointer is not a pure
; load chain. The CHECK lines above expect the fallback path: the struct is
; copied into a local depot (ld.param + st.local) and accessed via ld.local.
define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
%tmp4 = load i32, i32* %tmp3, align 4
store i32 %tmp4, i32* %arg, align 4
; The escaping call that forces the temporary local copy.
%tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
ret void
}
; Function Attrs: convergent nounwind
declare dso_local i32* @escape(i32*) local_unnamed_addr
; Module metadata. The nvvm.annotations entries mark all three functions as
; kernels, which is what makes their byval params eligible for the direct
; param-AS lowering tested above (presumably — confirm against the pass).
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}

View File

@ -35,7 +35,7 @@ define void @ptr_in_byval_kernel(%struct.S* byval(%struct.S) %input, i32* %outpu
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%b = load i32*, i32** %b_ptr, align 4
%b = load i32*, i32** %b_ptr, align 8
%v = load i32, i32* %b, align 4
; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
store i32 %v, i32* %output, align 4
@ -51,7 +51,7 @@ define void @ptr_in_byval_func(%struct.S* byval(%struct.S) %input, i32* %output)
; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%b = load i32*, i32** %b_ptr, align 4
%b = load i32*, i32** %b_ptr, align 8
%v = load i32, i32* %b, align 4
; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
store i32 %v, i32* %output, align 4