mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[NVPTX] Avoid temp copy of byval kernel parameters.
Avoid making a temporary copy of a byval argument if all accesses are loads, and therefore the pointer to the parameter cannot escape. This avoids excessive global memory accesses when each kernel makes its own copy. Differential revision: https://reviews.llvm.org/D98469
This commit is contained in:
parent
40de5cf96a
commit
7a17da6eb6
@ -140,6 +140,7 @@ INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
|
||||
|
||||
// =============================================================================
|
||||
// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
|
||||
// and we can't guarantee that the only accesses are loads,
|
||||
// then add the following instructions to the first basic block:
|
||||
//
|
||||
// %temp = alloca %struct.x, align 8
|
||||
@ -150,7 +151,57 @@ INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
|
||||
// The above code allocates some space in the stack and copies the incoming
|
||||
// struct from param space to local space.
|
||||
// Then replace all occurrences of %d by %temp.
|
||||
//
|
||||
// In case we know that all users are GEPs or Loads, replace them with the same
|
||||
// ones in parameter AS, so we can access them using ld.param.
|
||||
// =============================================================================
|
||||
|
||||
// Replaces the \p OldUser instruction with the same in parameter AS.
|
||||
// Only Load and GEP are supported.
|
||||
static void convertToParamAS(Value *OldUser, Value *Param) {
|
||||
Instruction *I = dyn_cast<Instruction>(OldUser);
|
||||
assert(I && "OldUser must be an instruction");
|
||||
struct IP {
|
||||
Instruction *OldInstruction;
|
||||
Value *NewParam;
|
||||
};
|
||||
SmallVector<IP> ItemsToConvert = {{I, Param}};
|
||||
SmallVector<GetElementPtrInst *> GEPsToDelete;
|
||||
while (!ItemsToConvert.empty()) {
|
||||
IP I = ItemsToConvert.pop_back_val();
|
||||
if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction))
|
||||
LI->setOperand(0, I.NewParam);
|
||||
else if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
|
||||
SmallVector<Value *, 4> Indices(GEP->indices());
|
||||
auto *NewGEP = GetElementPtrInst::Create(nullptr, I.NewParam, Indices,
|
||||
GEP->getName(), GEP);
|
||||
NewGEP->setIsInBounds(GEP->isInBounds());
|
||||
llvm::for_each(GEP->users(), [NewGEP, &ItemsToConvert](Value *V) {
|
||||
ItemsToConvert.push_back({cast<Instruction>(V), NewGEP});
|
||||
});
|
||||
GEPsToDelete.push_back(GEP);
|
||||
} else
|
||||
llvm_unreachable("Only Load and GEP can be converted to param AS.");
|
||||
}
|
||||
llvm::for_each(GEPsToDelete,
|
||||
[](GetElementPtrInst *GEP) { GEP->eraseFromParent(); });
|
||||
}
|
||||
|
||||
// Returns true if \p Start and all of its transitive users are only GEPs and
// Loads, i.e. every access is a read and the pointer cannot escape.
// (Fixed: dropped the stray ';' after the function body, which triggers
// -Wextra-semi.)
static bool isALoadChain(Value *Start) {
  SmallVector<Value *, 16> ValuesToCheck = {Start};
  while (!ValuesToCheck.empty()) {
    Value *V = ValuesToCheck.pop_back_val();
    Instruction *I = dyn_cast<Instruction>(V);
    // A non-instruction user (e.g. a constant expression) is treated as an
    // escape — be conservative.
    if (!I)
      return false;
    if (isa<GetElementPtrInst>(I))
      // GEPs only derive a new pointer; keep chasing their users.
      ValuesToCheck.append(I->user_begin(), I->user_end());
    else if (!isa<LoadInst>(I))
      // Anything other than a Load (store, call, cast, ...) may write
      // through or leak the pointer.
      return false;
  }
  return true;
}
|
||||
|
||||
void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
|
||||
Function *Func = Arg->getParent();
|
||||
Instruction *FirstInst = &(Func->getEntryBlock().front());
|
||||
@ -159,6 +210,21 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
|
||||
assert(PType && "Expecting pointer type in handleByValParam");
|
||||
|
||||
Type *StructType = PType->getElementType();
|
||||
|
||||
if (llvm::all_of(Arg->users(), isALoadChain)) {
|
||||
// Replace all loads with the loads in param AS. This allows loading the Arg
|
||||
// directly from parameter AS, without making a temporary copy.
|
||||
SmallVector<User *, 16> UsersToUpdate(Arg->users());
|
||||
Value *ArgInParamAS = new AddrSpaceCastInst(
|
||||
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
|
||||
FirstInst);
|
||||
llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
|
||||
convertToParamAS(V, ArgInParamAS);
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise we have to create a temporary copy.
|
||||
const DataLayout &DL = Func->getParent()->getDataLayout();
|
||||
unsigned AS = DL.getAllocaAddrSpace();
|
||||
AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
|
||||
|
@ -23,5 +23,12 @@ entry:
|
||||
%arg.idx.val.val = load i32, i32* %arg.idx.val, align 4
|
||||
%add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
|
||||
store i32 %add.i, i32* %arg.idx1.val, align 4
|
||||
|
||||
; let the pointer escape so we still create a local copy this test uses to
|
||||
; check the load alignment.
|
||||
%tmp = call i32* @escape(i32* nonnull %arg.idx2)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare dso_local i32* @escape(i32*) local_unnamed_addr
|
||||
|
92
test/CodeGen/NVPTX/lower-byval-args.ll
Normal file
92
test/CodeGen/NVPTX/lower-byval-args.ll
Normal file
@ -0,0 +1,92 @@
|
||||
; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.ham = type { [4 x i32] }
|
||||
|
||||
; // Verify that load with static offset into parameter is done directly.
; CHECK-LABEL: .visible .entry static_offset
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
; The directive below was '; CHECK st.global.u32' (missing colon), which
; FileCheck silently ignores; fixed so the store is actually verified.
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
  %tmp = icmp eq i32 %arg2, 3
  br i1 %tmp, label %bb3, label %bb6

bb3:                                              ; preds = %bb
  %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
  %tmp5 = load i32, i32* %tmp4, align 4
  store i32 %tmp5, i32* %arg, align 4
  br label %bb6

bb6:                                              ; preds = %bb3, %bb
  ret void
}
|
||||
|
||||
; // Verify that load with dynamic offset into parameter is also done directly.
; CHECK-LABEL: .visible .entry dynamic_offset
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
; The directive below was '; CHECK st.global.u32' (missing colon), which
; FileCheck silently ignores; fixed so the store is actually verified.
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];

; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
  %tmp = sext i32 %arg2 to i64
  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
  %tmp4 = load i32, i32* %tmp3, align 4
  store i32 %tmp4, i32* %arg, align 4
  ret void
}
|
||||
|
||||
; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
; CHECK-LABEL: .visible .entry pointer_escapes
; CHECK: .local .align 8 .b8 __local_depot{{.*}}
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
; The directive below was '; CHECK st.global.u32' (missing colon), which
; FileCheck silently ignores; fixed so the store is actually verified.
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];

; Function Attrs: convergent norecurse nounwind mustprogress
define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
bb:
  %tmp = sext i32 %arg2 to i64
  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
  %tmp4 = load i32, i32* %tmp3, align 4
  store i32 %tmp4, i32* %arg, align 4
  %tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
  ret void
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare dso_local i32* @escape(i32*) local_unnamed_addr
|
||||
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
|
||||
!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
|
||||
!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}
|
@ -35,7 +35,7 @@ define void @ptr_in_byval_kernel(%struct.S* byval(%struct.S) %input, i32* %outpu
|
||||
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
|
||||
; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
|
||||
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
|
||||
%b = load i32*, i32** %b_ptr, align 4
|
||||
%b = load i32*, i32** %b_ptr, align 8
|
||||
%v = load i32, i32* %b, align 4
|
||||
; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
|
||||
store i32 %v, i32* %output, align 4
|
||||
@ -51,7 +51,7 @@ define void @ptr_in_byval_func(%struct.S* byval(%struct.S) %input, i32* %output)
|
||||
; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
|
||||
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
|
||||
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
|
||||
%b = load i32*, i32** %b_ptr, align 4
|
||||
%b = load i32*, i32** %b_ptr, align 8
|
||||
%v = load i32, i32* %b, align 4
|
||||
; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
|
||||
store i32 %v, i32* %output, align 4
|
||||
|
Loading…
Reference in New Issue
Block a user