From 1ba93908c7ebda8692cbe8df89d13d3cf9e34382 Mon Sep 17 00:00:00 2001
From: Scott Linder
Date: Fri, 26 Oct 2018 13:18:36 +0000
Subject: [PATCH] [AMDGPU] Add a pass to promote bitcast calls

AMDGPU currently only supports direct calls, but at lower optimization
levels it fails to lower statically direct calls which appear indirect
due to a bitcast. Add a pass to visit all CallSites and use
CallPromotionUtils to "devirtualize" calls (an illustrative before/after
sketch of the rewrite follows the pass implementation below).

Differential Revision: https://reviews.llvm.org/D52741

llvm-svn: 345382
---
 lib/Target/AMDGPU/AMDGPU.h                    |   4 +
 .../AMDGPU/AMDGPUFixFunctionBitcasts.cpp      |  63 ++++++++
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp     |   6 +
 lib/Target/AMDGPU/CMakeLists.txt              |   1 +
 test/CodeGen/AMDGPU/call-constexpr.ll         | 140 ++++++++++++++++++
 .../AMDGPU/promote-alloca-bitcast-function.ll |  14 +-
 test/CodeGen/AMDGPU/unsupported-calls.ll      |   2 +-
 7 files changed, 222 insertions(+), 8 deletions(-)
 create mode 100644 lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
 create mode 100644 test/CodeGen/AMDGPU/call-constexpr.ll

diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 5e8a402fb6e..457ec9f9a95 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -77,6 +77,10 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+ModulePass *createAMDGPUFixFunctionBitcastsPass();
+void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
+extern char &AMDGPUFixFunctionBitcastsID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
new file mode 100644
index 00000000000..6e2a981d339
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -0,0 +1,63 @@
+//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Promote indirect (bitcast) calls to direct calls when they are statically
+/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
+/// because AMDGPU does not support indirect calls.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
+
+namespace {
+class AMDGPUFixFunctionBitcasts final
+    : public ModulePass,
+      public InstVisitor<AMDGPUFixFunctionBitcasts> {
+
+  bool runOnModule(Module &M) override;
+
+  bool Modified;
+
+public:
+  void visitCallSite(CallSite CS) {
+    if (CS.getCalledFunction())
+      return;
+    auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+    if (Callee && isLegalToPromote(CS, Callee)) {
+      promoteCall(CS, Callee);
+      Modified = true;
+    }
+  }
+
+  static char ID;
+  AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUFixFunctionBitcasts::ID = 0;
+char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID;
+INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE,
+                "Fix function bitcasts for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
+  return new AMDGPUFixFunctionBitcasts();
+}
+
+bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
+  Modified = false;
+  visit(M);
+  return Modified;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ef54100a9c4..6d39c254c73 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -166,6 +166,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
@@ -611,6 +612,11 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&PatchableFunctionID);
 
   addPass(createAtomicExpandPass());
+
+  // This must occur before inlining, as the inliner will not look through
+  // bitcast calls.
+  addPass(createAMDGPUFixFunctionBitcastsPass());
+
   addPass(createAMDGPULowerIntrinsicsPass());
 
   // Function calls are not supported, so make sure we inline everything.
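Illustration, not part of the patch: the shape of the rewrite, reconstructed from the OPT checks in call-constexpr.ll below. The kernel name @caller is a placeholder. Before the pass runs, the call is textually indirect because its callee operand is a constantexpr bitcast of @ident_i32:

; Before: the callee operand is a constantexpr bitcast, so the call site
; looks indirect even though the target is statically known.
define amdgpu_kernel void @caller() {
  %val = call float bitcast (i32 (i32)* @ident_i32 to float (float)*)(float 2.0)
  store volatile float %val, float addrspace(1)* undef
  ret void
}

define i32 @ident_i32(i32 %i) {
  ret i32 %i
}

After -amdgpu-fix-function-bitcasts, promoteCall has turned it into a direct call, reconciling the mismatched argument and return types with explicit bitcast instructions:

; After: a direct call to @ident_i32; the float argument and the i32 result
; are converted with bitcasts inserted around the call.
define amdgpu_kernel void @caller() {
  %1 = bitcast float 2.000000e+00 to i32
  %val = call i32 @ident_i32(i32 %1)
  %2 = bitcast i32 %val to float
  store volatile float %2, float addrspace(1)* undef
  ret void
}

At OptNone nothing else folds the constantexpr away, which is why the pass is added to addIRPasses ahead of the inliner.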
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 5af27cd1d8c..3c87dc18827 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -40,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstrInfo.cpp
diff --git a/test/CodeGen/AMDGPU/call-constexpr.ll b/test/CodeGen/AMDGPU/call-constexpr.ll
new file mode 100644
index 00000000000..e0a39680bdf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/call-constexpr.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-fix-function-bitcasts < %s | FileCheck -check-prefix=OPT %s
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_noinline:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_noinline(
+; OPT: %val = call i32 @ret_i32_noinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_noinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline:
+; GCN-NOT: s_getpc_b64
+; GCN-NOT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@lo+4
+; GCN-NOT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@hi+4
+; GCN-NOT: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_alwaysinline(
+; OPT: %val = call i32 @ret_i32_alwaysinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_alwaysinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_type:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_type(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT-NOT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_type() #0 {
+  %val = call i32 bitcast (i32(i32)* @ident_i32 to i32(float)*)(float 2.0)
+  %op = add i32 %val, 1
+  store volatile i32 %op, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_and_return_types(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
+  %val = call float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64
+define i32 @use_workitem_id_x(i32 %arg0) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %op = add i32 %id, %arg0
+  ret i32 %op
+}
+
+; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+4
+; GCN: v_mov_b32_e32 v0, 9
+; GCN: s_swappc_b64
+; GCN: v_add_f32_e32
+; OPT-LABEL: @test_bitcast_use_workitem_id_x(
+; OPT: %val = call i32 @use_workitem_id_x(i32 9)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 {
+  %val = call float bitcast (i32(i32)* @use_workitem_id_x to float(i32)*)(i32 9)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_invoke:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_invoke(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = invoke i32 @ident_i32(i32 %1)
+; OPT-NEXT: to label %continue.split unwind label %broken
+; OPT-LABEL: continue.split:
+; OPT: bitcast i32 %val to float
+@_ZTIi = external global i8*
+declare i32 @__gxx_personality_v0(...)
+define amdgpu_kernel void @test_invoke() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  %val = invoke float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+          to label %continue unwind label %broken
+
+broken:
+  landingpad { i8*, i32 } catch i8** @_ZTIi
+  ret void
+
+continue:
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; Callees appear last in the source file to test that we still lower their
+; arguments before we lower any calls to them.
+
+define i32 @ret_i32_noinline() #0 {
+  ret i32 4
+}
+
+define i32 @ret_i32_alwaysinline() #1 {
+  ret i32 4
+}
+
+define i32 @ident_i32(i32 %i) #0 {
+  ret i32 %i
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind noinline }
+attributes #1 = { alwaysinline nounwind }
+attributes #2 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 19e89ce97a9..5d8863f4337 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -1,8 +1,4 @@
-; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck %s
-
-; FIXME: Error is misleading because it's not an indirect call.
-
-; CHECK: error: <unknown>:0:0: in function crash_call_constexpr_cast void (): unsupported indirect call to function foo
+; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
 
 ; Make sure that AMDGPUPromoteAlloca doesn't crash if the called
 ; function is a constantexpr cast of a function.
@@ -10,14 +6,18 @@
 declare void @foo(float addrspace(5)*) #0
 declare void @foo.varargs(...) #0
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
+; CHECK-LABEL: @crash_call_constexpr_cast(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (float addrspace(5)*)* @foo to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
   ret void
 }
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
+; CHECK-LABEL: @crash_call_constexpr_cast_varargs(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (...)* @foo.varargs to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
diff --git a/test/CodeGen/AMDGPU/unsupported-calls.ll b/test/CodeGen/AMDGPU/unsupported-calls.ll
index 2b6e15b79a4..303a0d6a114 100644
--- a/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -53,7 +53,7 @@ define void @test_call_varargs() {
 
 declare i32 @extern_variadic(...)
 
-; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported indirect call to function extern_variadic
+; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to variadic function extern_variadic
 ; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
 define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
   %add = fadd <4 x float> %arg0, %arg1
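A note on the test_invoke checks above, since the "continue.split" label is easy to misread: an invoke terminates its basic block, so promoteCall cannot simply insert the return-value bitcast after the instruction. CallPromotionUtils instead splits the edge to the normal destination and places the cast in the new block. The sketch below shows the expected post-promotion shape; the function @invoker is a placeholder, and the ".split" suffix is the edge splitter's usual naming, which the OPT-LABEL in the test relies on.

; Post-promotion shape (sketch): the invoke is now direct, and the
; i32-to-float bitcast of %val lives in a block split off the normal edge.
@_ZTIi = external global i8*

declare i32 @__gxx_personality_v0(...)

define i32 @ident_i32(i32 %i) {
  ret i32 %i
}

define float @invoker() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
  %1 = bitcast float 2.000000e+00 to i32
  %val = invoke i32 @ident_i32(i32 %1)
          to label %continue.split unwind label %broken

continue.split:
  %2 = bitcast i32 %val to float
  br label %continue

broken:
  %lp = landingpad { i8*, i32 }
          catch i8** @_ZTIi
  ret float 0.000000e+00

continue:
  ret float %2
}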