Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2024-10-19 11:02:59 +02:00).
Commit: "AMDGPU: Add pass to expand memcpy/memmove/memset" (llvm-svn: 294635).
This commit is contained in:
parent
57ac23d53e
commit
478a09d3d1
@ -51,6 +51,10 @@ ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
|
||||
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
|
||||
extern char &AMDGPUAnnotateKernelFeaturesID;
|
||||
|
||||
ModulePass *createAMDGPULowerIntrinsicsPass();
|
||||
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
|
||||
extern char &AMDGPULowerIntrinsicsID;
|
||||
|
||||
void initializeSIFoldOperandsPass(PassRegistry &);
|
||||
extern char &SIFoldOperandsID;
|
||||
|
||||
|
@ -461,10 +461,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
|
||||
// N > 4 stores on the same chain.
|
||||
GatherAllAliasesMaxDepth = 16;
|
||||
|
||||
// FIXME: Need to really handle these.
|
||||
MaxStoresPerMemcpy = 4096;
|
||||
MaxStoresPerMemmove = 4096;
|
||||
MaxStoresPerMemset = 4096;
|
||||
// memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
|
||||
// about these during lowering.
|
||||
MaxStoresPerMemcpy = 0xffffffff;
|
||||
MaxStoresPerMemmove = 0xffffffff;
|
||||
MaxStoresPerMemset = 0xffffffff;
|
||||
|
||||
setTargetDAGCombine(ISD::BITCAST);
|
||||
setTargetDAGCombine(ISD::SHL);
|
||||
|
123
lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
Normal file
123
lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
Normal file
@ -0,0 +1,123 @@
|
||||
//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/IntrinsicInst.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-lower-intrinsics"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {

// Static (constant-size) mem operations at or below this many bytes are left
// for the backend to lower; anything larger, or of unknown size, is expanded
// into an explicit loop in the IR by this pass.
const unsigned MaxStaticSize = 1024;

// Module pass that expands large or variable-length memcpy/memmove/memset
// intrinsic calls into IR loops, so instruction selection never has to lower
// oversized mem operations itself.
class AMDGPULowerIntrinsics : public ModulePass {
public:
  static char ID;

  AMDGPULowerIntrinsics() : ModulePass(ID) { }

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Lower Intrinsics";
  }
};

} // end anonymous namespace
|
||||
|
||||
char AMDGPULowerIntrinsics::ID = 0;

// Exported address-of-ID handle; the target machine uses this to reference
// the pass (see AMDGPU.h's `extern char &AMDGPULowerIntrinsicsID;`).
char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;

// Register the pass with the legacy pass manager under "amdgpu-lower-intrinsics".
INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
                "Lower intrinsics", false, false)
|
||||
|
||||
// TODO: Should refine based on estimated number of accesses (e.g. does it
|
||||
// require splitting based on alignment)
|
||||
static bool shouldExpandOperationWithSize(Value *Size) {
|
||||
ConstantInt *CI = dyn_cast<ConstantInt>(Size);
|
||||
return !CI || (CI->getZExtValue() > MaxStaticSize);
|
||||
}
|
||||
|
||||
// Expand every call to the mem intrinsic declaration \p F whose size operand
// is non-constant or exceeds MaxStaticSize into an explicit IR loop, erasing
// the original call. Returns true if any call was expanded.
static bool expandMemIntrinsicUses(Function &F) {
  Intrinsic::ID ID = F.getIntrinsicID();
  // Fix: 'Changed' was previously uninitialized, so returning it without
  // expanding anything read an indeterminate value (undefined behavior).
  bool Changed = false;

  for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
    Instruction *Inst = cast<Instruction>(*I);
    // Advance before any erase below: erasing Inst removes the use *I points at.
    ++I;

    switch (ID) {
    case Intrinsic::memcpy: {
      auto *Memcpy = cast<MemCpyInst>(Inst);
      if (shouldExpandOperationWithSize(Memcpy->getLength())) {
        expandMemCpyAsLoop(Memcpy);
        Changed = true;
        Memcpy->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memmove: {
      auto *Memmove = cast<MemMoveInst>(Inst);
      if (shouldExpandOperationWithSize(Memmove->getLength())) {
        expandMemMoveAsLoop(Memmove);
        Changed = true;
        Memmove->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memset: {
      auto *Memset = cast<MemSetInst>(Inst);
      if (shouldExpandOperationWithSize(Memset->getLength())) {
        expandMemSetAsLoop(Memset);
        Changed = true;
        Memset->eraseFromParent();
      }

      break;
    }
    default:
      break;
    }
  }

  return Changed;
}
|
||||
|
||||
// Walk the module's intrinsic declarations and expand the uses of the three
// mem intrinsics this pass handles. Returns true if the module was modified.
bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
  bool Modified = false;

  for (Function &Fn : M) {
    // Intrinsics only ever appear as declarations; skip defined functions.
    if (!Fn.isDeclaration())
      continue;

    switch (Fn.getIntrinsicID()) {
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
    case Intrinsic::memset:
      Modified |= expandMemIntrinsicUses(Fn);
      break;
    default:
      break;
    }
  }

  return Modified;
}
|
||||
|
||||
// Factory used by AMDGPUPassConfig::addIRPasses() to insert this pass into
// the codegen pipeline. Ownership passes to the pass manager.
ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
  return new AMDGPULowerIntrinsics();
}
|
@ -108,6 +108,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
|
||||
initializeSILoadStoreOptimizerPass(*PR);
|
||||
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
|
||||
initializeAMDGPUAnnotateUniformValuesPass(*PR);
|
||||
initializeAMDGPULowerIntrinsicsPass(*PR);
|
||||
initializeAMDGPUPromoteAllocaPass(*PR);
|
||||
initializeAMDGPUCodeGenPreparePass(*PR);
|
||||
initializeAMDGPUUnifyMetadataPass(*PR);
|
||||
@ -472,6 +473,8 @@ void AMDGPUPassConfig::addIRPasses() {
|
||||
disablePass(&FuncletLayoutID);
|
||||
disablePass(&PatchableFunctionID);
|
||||
|
||||
addPass(createAMDGPULowerIntrinsicsPass());
|
||||
|
||||
// Function calls are not supported, so make sure we inline everything.
|
||||
addPass(createAMDGPUAlwaysInlinePass());
|
||||
addPass(createAlwaysInlinerLegacyPass());
|
||||
|
@ -45,6 +45,7 @@ add_llvm_target(AMDGPUCodeGen
|
||||
AMDGPUTargetObjectFile.cpp
|
||||
AMDGPUIntrinsicInfo.cpp
|
||||
AMDGPUISelDAGToDAG.cpp
|
||||
AMDGPULowerIntrinsics.cpp
|
||||
AMDGPUMCInstLower.cpp
|
||||
AMDGPUMachineFunction.cpp
|
||||
AMDGPUUnifyMetadata.cpp
|
||||
|
117
test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
Normal file
117
test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
Normal file
@ -0,0 +1,117 @@
|
||||
; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s

declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1

declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) #1

; Test the upper bound for sizes to leave alone (MaxStaticSize = 1024 bytes):
; a 1024-byte constant-size memcpy must survive unexpanded.
; OPT-LABEL: @max_size_small_static_memcpy_caller0(
; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
define void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
  ret void
}

; Smallest static size which will be expanded (1025 > MaxStaticSize).
; OPT-LABEL: @min_size_large_static_memcpy_caller0(
; OPT-NOT: call
; OPT: getelementptr
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
define void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
  ret void
}

; Same boundary checks for memmove: 1024 bytes is left alone...
; OPT-LABEL: @max_size_small_static_memmove_caller0(
; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
define void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
  ret void
}

; ...and 1025 bytes is expanded to a load/store loop.
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NOT: call
; OPT: getelementptr
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
define void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
  ret void
}

; Same boundary checks for memset: 1024 bytes is left alone...
; OPT-LABEL: @max_size_small_static_memset_caller0(
; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
define void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
  ret void
}

; ...and 1025 bytes is expanded to a store loop.
; OPT-LABEL: @min_size_large_static_memset_caller0(
; OPT-NOT: call
; OPT: getelementptr
; OPT: store i8
define void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false)
  ret void
}

; A non-constant length is always expanded (the phi comes from the loop).
; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NOT: call
; OPT: phi
define void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @variable_memcpy_caller1(
; OPT-NOT: call
; OPT: phi
define void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  ret void
}

; Multiple variable-length uses in one function: both calls expand to loops.
; OPT-LABEL: @memcpy_multi_use_one_function(
; OPT-NOT: call
; OPT: phi
; OPT-NOT: call
; OPT: phi
; OPT-NOT: call
define void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false)
  ret void
}

; Expansion preserves the source/destination address spaces (3 -> 1) and the
; i32 length type.
; OPT-LABEL: @memcpy_alt_type(
; OPT: phi
; OPT: getelementptr inbounds i8, i8 addrspace(3)*
; OPT: load i8, i8 addrspace(3)*
; OPT: getelementptr inbounds i8, i8 addrspace(1)*
; OPT: store i8
define void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false)
  ret void
}

; One of the uses in the function should be expanded, the other left alone.
; OPT-LABEL: @memcpy_multi_use_one_function_keep_small(
; OPT: getelementptr inbounds i8, i8 addrspace(1)*
; OPT: load i8, i8 addrspace(1)*
; OPT: getelementptr inbounds i8, i8 addrspace(1)*
; OPT: store i8

; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
define void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }
|
Loading…
Reference in New Issue
Block a user