1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 19:12:56 +02:00

AMDGPU: Fix breaking IR on instructions with multiple pointer operands

The promote alloca pass would attempt to promote an alloca with
a select, icmp, or phi user, even though the other operand was
from a non-promotable source, producing a select on two different
pointer types.

Only do this if we know that both operands derive from the same
alloca. In the future we should be able to relax this to an alloca
which will also be promoted.

llvm-svn: 269265
This commit is contained in:
Matt Arsenault 2016-05-12 01:58:58 +00:00
parent ed41711e86
commit ac3313688f
4 changed files with 401 additions and 8 deletions

View File

@ -32,6 +32,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
private:
const TargetMachine *TM;
Module *Mod;
const DataLayout *DL;
MDNode *MaxWorkGroupSizeRange;
// FIXME: This should be per-kernel.
@ -43,6 +44,20 @@ private:
std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
/// BaseAlloca is the alloca root the search started from.
/// Val may be that alloca or a recursive user of it.
bool collectUsesWithPtrTypes(Value *BaseAlloca,
Value *Val,
std::vector<Value*> &WorkList) const;
/// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
/// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
/// Returns true if both operands are derived from the same alloca. Val should
/// be the same value as one of the input operands of UseInst.
bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
Instruction *UseInst,
int OpIdx0, int OpIdx1) const;
public:
static char ID;
@ -50,6 +65,7 @@ public:
FunctionPass(ID),
TM(TM_),
Mod(nullptr),
DL(nullptr),
MaxWorkGroupSizeRange(nullptr),
LocalMemAvailable(0),
IsAMDGCN(false),
@ -63,6 +79,11 @@ public:
}
void handleAlloca(AllocaInst &I);
/// Reports this pass's analysis usage: the CFG is marked as preserved, and
/// everything else is delegated to the FunctionPass base implementation.
void getAnalysisUsage(AnalysisUsage &AU) const override {
// NOTE(review): presumably safe because the pass only rewrites allocas and
// their users without touching blocks or terminators -- confirm.
AU.setPreservesCFG();
FunctionPass::getAnalysisUsage(AU);
}
};
} // End anonymous namespace
@ -80,6 +101,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
return false;
Mod = &M;
DL = &Mod->getDataLayout();
// The maximum workitem id.
//
@ -131,8 +153,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
continue;
if (Use->getParent()->getParent() == &F) {
LocalMemAvailable -=
Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
LocalMemAvailable -= DL->getTypeAllocSize(GV.getValueType());
break;
}
}
@ -428,7 +449,39 @@ static bool isCallPromotable(CallInst *CI) {
}
}
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
/// Decide whether the pointer operand of \p Inst that is not \p Val has
/// \p BaseAlloca as its underlying object.
///
/// \p OpIdx0 and \p OpIdx1 are the indices of the two pointer operands of
/// \p Inst, and \p Val is expected to be one of them. Returns true only when
/// the remaining operand's underlying object is exactly \p BaseAlloca.
bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
                                                          Value *Val,
                                                          Instruction *Inst,
                                                          int OpIdx0,
                                                          int OpIdx1) const {
  // Select the operand we did not arrive through; that is the one which
  // might not be getting promoted.
  Value *FirstOp = Inst->getOperand(OpIdx0);
  Value *Candidate = (FirstOp == Val) ? Inst->getOperand(OpIdx1) : FirstOp;

  Value *Root = GetUnderlyingObject(Candidate, *DL);

  // Anything that is not rooted at an alloca is rejected outright.
  // TODO: We should be able to replace undefs with the right pointer type.
  if (!isa<AllocaInst>(Root))
    return false;

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (Root == BaseAlloca)
    return true;

  DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
  return false;
}
bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
Value *BaseAlloca,
Value *Val,
std::vector<Value*> &WorkList) const {
for (User *User : Val->users()) {
if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
continue;
@ -441,11 +494,11 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
continue;
}
Instruction *UseInst = dyn_cast<Instruction>(User);
if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
Instruction *UseInst = cast<Instruction>(User);
if (UseInst->getOpcode() == Instruction::PtrToInt)
return false;
if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
if (SI->isVolatile())
return false;
@ -464,6 +517,13 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
return false;
}
// Only promote an icmp if we know that the other compare operand
// is from another pointer that will also be promoted.
if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
return false;
}
if (!User->getType()->isPointerTy())
continue;
@ -474,8 +534,31 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
return false;
}
// Only promote a select if we know that the other select operand is from
// another pointer that will also be promoted.
if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
return false;
}
// Repeat for phis.
if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
// TODO: Handle more complex cases. We should be able to replace loops
// over arrays.
switch (Phi->getNumIncomingValues()) {
case 1:
break;
case 2:
if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
return false;
break;
default:
return false;
}
}
WorkList.push_back(User);
if (!collectUsesWithPtrTypes(User, WorkList))
if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
return false;
}
@ -516,7 +599,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
std::vector<Value*> WorkList;
if (!collectUsesWithPtrTypes(&I, WorkList)) {
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
return;
}

View File

@ -0,0 +1,38 @@
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
; This would normally be folded by instcombine into a compare of the GEP
; indices
; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
%cmp = icmp eq i32* %ptr0, %ptr1
%zext = zext i1 %cmp to i32
store volatile i32 %zext, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
; CHECK: %alloca = alloca [16 x i32], align 4
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
; CHECK: %ptr1 = call i32* @get_unknown_pointer()
; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = call i32* @get_unknown_pointer()
%cmp = icmp eq i32* %ptr0, %ptr1
%zext = zext i1 %cmp to i32
store volatile i32 %zext, i32 addrspace(1)* %out
ret void
}
declare i32* @get_unknown_pointer() #0
attributes #0 = { nounwind }

View File

@ -0,0 +1,170 @@
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
; Both incoming values of %phi.ptr are GEPs off the same alloca, so the alloca
; is expected to be promoted to LDS. Unnamed values are matched with a pattern
; rather than a fixed number so the test does not depend on value numbering.
; CHECK-LABEL: @branch_ptr_var_same_alloca(
; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: if:
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
; CHECK: else:
; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %b
; CHECK: endif:
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4
br i1 undef, label %if, label %else
if:
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
br label %endif
else:
%arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %b
br label %endif
endif:
%phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
store i32 0, i32* %phi.ptr, align 4
ret void
}
; A phi with a single incoming value derived from the alloca; the alloca is
; expected to be promoted to LDS. The unnamed index value is matched with a
; pattern rather than a fixed number so the test does not depend on value
; numbering.
; CHECK-LABEL: @one_phi_value(
; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
; CHECK: br label %exit
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
define void @one_phi_value(i32 %a) #0 {
entry:
%alloca = alloca [64 x i32], align 4
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
br label %exit
exit:
%phi.ptr = phi i32* [ %arrayidx0, %entry ]
store i32 0, i32* %phi.ptr, align 4
ret void
}
; CHECK-LABEL: @branch_ptr_alloca_unknown_obj(
; CHECK: %alloca = alloca [64 x i32], align 4
; CHECK: if:
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
; CHECK: else:
; CHECK: %arrayidx1 = call i32* @get_unknown_pointer()
; CHECK: endif:
; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
; CHECK: store i32 0, i32* %phi.ptr, align 4
define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4
br i1 undef, label %if, label %else
if:
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
br label %endif
else:
%arrayidx1 = call i32* @get_unknown_pointer()
br label %endif
endif:
%phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
store i32 0, i32* %phi.ptr, align 4
ret void
}
; kernel void ptr_induction_var_same_alloca(void)
; {
; int alloca[64];
; int i = 0;
; #pragma nounroll
; for (int* p = &alloca[2], *e = &alloca[48]; p != e; ++p, ++i)
; {
; *p = i;
; }
; }
; FIXME: This should be promotable. We need to use
; GetUnderlyingObjects when looking at the icmp user.
; CHECK-LABEL: @ptr_induction_var_same_alloca(
; CHECK: %alloca = alloca [64 x i32], align 4
; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
define void @ptr_induction_var_same_alloca() #0 {
entry:
%alloca = alloca [64 x i32], align 4
%arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
%arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 48
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%p.08 = phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
store i32 %i.09, i32* %p.08, align 4
%incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
%inc = add nuw nsw i32 %i.09, 1
%cmp = icmp eq i32* %incdec.ptr, %arrayidx1
br i1 %cmp, label %for.cond.cleanup, label %for.body
}
; extern int* get_unknown_pointer(void);
; kernel void ptr_induction_var_alloca_unknown(void)
; {
; int alloca[64];
; int i = 0;
;
; for (int* p = &alloca[2], *e = get_unknown_pointer(); p != e; ++p, ++i)
; {
; *p = i;
; }
; }
; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
; CHECK: %alloca = alloca [64 x i32], align 4
; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
define void @ptr_induction_var_alloca_unknown() #0 {
entry:
%alloca = alloca [64 x i32], align 4
%arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
%call = tail call i32* @get_unknown_pointer() #2
%cmp.7 = icmp eq i32* %arrayidx, %call
br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.body, %for.body.preheader
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
store i32 %i.09, i32* %p.08, align 4
%incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
%inc = add nuw nsw i32 %i.09, 1
%cmp = icmp eq i32* %incdec.ptr, %call
br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
}
declare i32* @get_unknown_pointer() #0
attributes #0 = { nounwind }

View File

@ -0,0 +1,102 @@
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
; CHECK: %alloca = alloca i32
; CHECK: select i1 undef, i32* undef, i32* %alloca
define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
%alloca = alloca i32, align 4
%select = select i1 undef, i32* undef, i32* %alloca
store i32 0, i32* %select, align 4
ret void
}
; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers(
; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_pointers.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
%select = select i1 undef, i32* %ptr0, i32* %ptr1
store i32 0, i32* %select, align 4
ret void
}
; FIXME: This should be promotable but requires knowing that both will be promoted first.
; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
; CHECK: %alloca0 = alloca i32, i32 16, align 4
; CHECK: %alloca1 = alloca i32, i32 16, align 4
; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
%alloca0 = alloca i32, i32 16, align 4
%alloca1 = alloca i32, i32 16, align 4
%ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
%ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
%select = select i1 undef, i32* %ptr0, i32* %ptr1
store i32 0, i32* %select, align 4
ret void
}
; TODO: Maybe this should be canonicalized to select on the constant and GEP after.
; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers(
; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_constant_pointers.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 1
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
%select = select i1 undef, i32* %ptr0, i32* %ptr1
store i32 0, i32* %select, align 4
ret void
}
; CHECK-LABEL: @lds_promoted_alloca_select_input_select(
; CHECK: getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_select_input_select.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %b
; CHECK: %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %c
; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
%ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
%select0 = select i1 undef, i32* %ptr0, i32* %ptr1
%select1 = select i1 undef, i32* %select0, i32* %ptr2
store i32 0, i32* %select1, align 4
ret void
}
; %select0 has an undef pointer operand and feeds %phi.ptr, which in turn
; feeds %select1. This function has no FileCheck directives of its own.
; NOTE(review): presumably it only needs to pass through the promote-alloca
; pass without crashing or producing invalid IR -- confirm.
define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
entry:
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
store i32 0, i32* %ptr0
br i1 undef, label %bb1, label %bb2
bb1:
%ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
%select0 = select i1 undef, i32* undef, i32* %ptr2
store i32 0, i32* %ptr1
br label %bb2
bb2:
%phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ]
%select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1
store i32 0, i32* %select1, align 4
ret void
}
attributes #0 = { norecurse nounwind }