
AMDGPU: Add pass to expand memcpy/memmove/memset

llvm-svn: 294635
Matt Arsenault 2017-02-09 22:00:42 +00:00
parent 57ac23d53e
commit 478a09d3d1
6 changed files with 253 additions and 4 deletions

lib/Target/AMDGPU/AMDGPU.h

@@ -51,6 +51,10 @@
 ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
+ModulePass *createAMDGPULowerIntrinsicsPass();
+void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
+extern char &AMDGPULowerIntrinsicsID;
+
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
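
These entry points follow the target's usual legacy-pass pattern, so the pass can also be driven by hand. A minimal sketch, assuming the AMDGPU backend is linked into the host tool (runLowerIntrinsics is a hypothetical helper, not part of this change):

#include "AMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical driver: schedule only the new lowering pass over a module.
static void runLowerIntrinsics(Module &M) {
  legacy::PassManager PM;
  PM.add(createAMDGPULowerIntrinsicsPass()); // entry point declared above
  PM.run(M);
}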

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

@@ -461,10 +461,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   // N > 4 stores on the same chain.
   GatherAllAliasesMaxDepth = 16;
 
-  // FIXME: Need to really handle these.
-  MaxStoresPerMemcpy = 4096;
-  MaxStoresPerMemmove = 4096;
-  MaxStoresPerMemset = 4096;
+  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
+  // about these during lowering.
+  MaxStoresPerMemcpy = 0xffffffff;
+  MaxStoresPerMemmove = 0xffffffff;
+  MaxStoresPerMemset = 0xffffffff;
 
   setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::SHL);
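
The huge limits are a backstop: any small memcpy/memmove/memset the IR pass deliberately leaves behind must still be inlined during DAG lowering, since function calls are not supported on this target. A simplified sketch of what these limits control (illustrative only, not the actual SelectionDAG logic; inlineAsStores and its parameters are made up for this note):

#include <cstdint>

// Illustrative only: the DAG inlines a mem operation when the estimated
// store count fits under the per-operation limit.
static bool inlineAsStores(uint64_t Bytes, unsigned WidestStoreBytes,
                           unsigned Limit) {
  uint64_t NumStores = (Bytes + WidestStoreBytes - 1) / WidestStoreBytes;
  return NumStores <= Limit; // Limit = 0xffffffff: effectively always inline
}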

lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp (new file)

@@ -0,0 +1,123 @@
//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#define DEBUG_TYPE "amdgpu-lower-intrinsics"
using namespace llvm;
namespace {
const unsigned MaxStaticSize = 1024;
class AMDGPULowerIntrinsics : public ModulePass {
public:
static char ID;
AMDGPULowerIntrinsics() : ModulePass(ID) { }
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
}
};
}
char AMDGPULowerIntrinsics::ID = 0;
char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
"Lower intrinsics", false, false)
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
static bool shouldExpandOperationWithSize(Value *Size) {
ConstantInt *CI = dyn_cast<ConstantInt>(Size);
return !CI || (CI->getZExtValue() > MaxStaticSize);
}
static bool expandMemIntrinsicUses(Function &F) {
Intrinsic::ID ID = F.getIntrinsicID();
bool Changed;
for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
Instruction *Inst = cast<Instruction>(*I);
++I;
switch (ID) {
case Intrinsic::memcpy: {
auto *Memcpy = cast<MemCpyInst>(Inst);
if (shouldExpandOperationWithSize(Memcpy->getLength())) {
expandMemCpyAsLoop(Memcpy);
Changed = true;
Memcpy->eraseFromParent();
}
break;
}
case Intrinsic::memmove: {
auto *Memmove = cast<MemMoveInst>(Inst);
if (shouldExpandOperationWithSize(Memmove->getLength())) {
expandMemMoveAsLoop(Memmove);
Changed = true;
Memmove->eraseFromParent();
}
break;
}
case Intrinsic::memset: {
auto *Memset = cast<MemSetInst>(Inst);
if (shouldExpandOperationWithSize(Memset->getLength())) {
expandMemSetAsLoop(Memset);
Changed = true;
Memset->eraseFromParent();
}
break;
}
default:
break;
}
}
return Changed;
}
bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
bool Changed = false;
for (Function &F : M) {
if (!F.isDeclaration())
continue;
switch (F.getIntrinsicID()) {
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
if (expandMemIntrinsicUses(F))
Changed = true;
break;
default:
break;
}
}
return Changed;
}
ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
return new AMDGPULowerIntrinsics();
}
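
The expansion itself comes from expandMemCpyAsLoop / expandMemMoveAsLoop / expandMemSetAsLoop in llvm/Transforms/Utils/LowerMemIntrinsics.h. A rough sketch of the shape of loop a memcpy becomes, written against the current IRBuilder API (emitByteCopyLoop is a hypothetical illustration, not the utility's actual implementation, and it assumes a nonzero i64 length):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical illustration: insert an i8-at-a-time copy loop (phi, gep,
// load i8, gep, store i8 -- the shape the test below checks for) right
// before InsertBefore. Assumes Len is a nonzero i64.
static void emitByteCopyLoop(Instruction *InsertBefore, Value *Dst, Value *Src,
                             Value *Len) {
  BasicBlock *Pre = InsertBefore->getParent();
  BasicBlock *Post = Pre->splitBasicBlock(InsertBefore, "copy.done");
  Function *F = Pre->getParent();
  BasicBlock *Loop = BasicBlock::Create(F->getContext(), "copy.loop", F, Post);
  Pre->getTerminator()->setSuccessor(0, Loop); // enter the loop instead

  IRBuilder<> B(Loop);
  PHINode *Idx = B.CreatePHI(Len->getType(), 2, "idx");
  Idx->addIncoming(ConstantInt::get(Len->getType(), 0), Pre);

  Value *SrcP = B.CreateInBoundsGEP(B.getInt8Ty(), Src, Idx);
  Value *Byte = B.CreateLoad(B.getInt8Ty(), SrcP);
  Value *DstP = B.CreateInBoundsGEP(B.getInt8Ty(), Dst, Idx);
  B.CreateStore(Byte, DstP);

  Value *Next = B.CreateAdd(Idx, ConstantInt::get(Len->getType(), 1));
  Idx->addIncoming(Next, Loop);
  B.CreateCondBr(B.CreateICmpULT(Next, Len), Loop, Post);
}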

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

@@ -108,6 +108,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
+  initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
@@ -472,6 +473,8 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  addPass(createAMDGPULowerIntrinsicsPass());
+
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerLegacyPass());

lib/Target/AMDGPU/CMakeLists.txt

@@ -45,6 +45,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetObjectFile.cpp
   AMDGPUIntrinsicInfo.cpp
   AMDGPUISelDAGToDAG.cpp
+  AMDGPULowerIntrinsics.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUUnifyMetadata.cpp

test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (new file)

@@ -0,0 +1,117 @@
; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s

declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) #1

; Test the largest static size that is left as an intrinsic call.
; OPT-LABEL: @max_size_small_static_memcpy_caller0(
; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
define void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
  ret void
}

; Smallest static size which will be expanded.
; OPT-LABEL: @min_size_large_static_memcpy_caller0(
; OPT-NOT: call
; OPT: getelementptr
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
define void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @max_size_small_static_memmove_caller0(
; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
define void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NOT: call
; OPT: getelementptr
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
define void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @max_size_small_static_memset_caller0(
; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
define void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @min_size_large_static_memset_caller0(
; OPT-NOT: call
; OPT: getelementptr
; OPT: store i8
define void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NOT: call
; OPT: phi
define void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @variable_memcpy_caller1(
; OPT-NOT: call
; OPT: phi
define void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @memcpy_multi_use_one_function(
; OPT-NOT: call
; OPT: phi
; OPT-NOT: call
; OPT: phi
; OPT-NOT: call
define void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false)
  ret void
}

; OPT-LABEL: @memcpy_alt_type(
; OPT: phi
; OPT: getelementptr inbounds i8, i8 addrspace(3)*
; OPT: load i8, i8 addrspace(3)*
; OPT: getelementptr inbounds i8, i8 addrspace(1)*
; OPT: store i8
define void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false)
  ret void
}

; One of the uses in the function should be expanded; the other is left alone.
; OPT-LABEL: @memcpy_multi_use_one_function_keep_small(
; OPT: getelementptr inbounds i8, i8 addrspace(1)*
; OPT: load i8, i8 addrspace(1)*
; OPT: getelementptr inbounds i8, i8 addrspace(1)*
; OPT: store i8
; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
define void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }