[AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization

Frame-base materialization may insert vector instructions before EXEC is
initialised. Fix this by moving the lowering of llvm.amdgcn.init.exec later
in the backend. Also remove the SI_INIT_EXEC_LO pseudo, as it is no longer
necessary.

Reviewed By: ruiling

Differential Revision: https://reviews.llvm.org/D94645

commit 8b5995e559 (parent f6cb3fe42b)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -182,6 +182,8 @@ def int_amdgcn_init_exec : Intrinsic<[],
 // Set EXEC according to a thread count packed in an SGPR input:
 //    thread_count = (input >> bitoffset) & 0x7f;
 // This is always moved to the beginning of the basic block.
+// Note: only inreg arguments to the parent function are valid as
+// inputs to this intrinsic, computed values cannot be used.
 def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [llvm_i32_ty,       // 32-bit SGPR input
    llvm_i32_ty],      // bit offset of the thread count
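The comment above fully determines the EXEC value. As a cross-check, here is
a minimal C++ model of llvm.amdgcn.init.exec.from.input (an illustrative
sketch only; execFromInput is a made-up name, not backend code):

#include <cstdint>

// EXEC mask implied by a packed thread count: one bit per active lane.
// The count must come from an inreg SGPR argument, per the note above.
uint64_t execFromInput(uint32_t Input, uint32_t BitOffset,
                       unsigned WavefrontSize /* 32 or 64 */) {
  uint32_t ThreadCount = (Input >> BitOffset) & 0x7f;
  // A bitfield-mask instruction cannot produce all ones, so the backend
  // special-cases a full wave (see the CMP + CMOV sequence further down).
  if (ThreadCount == WavefrontSize)
    return ~0ull >> (64 - WavefrontSize);
  return (1ull << (ThreadCount & 0x3f)) - 1; // S_BFM-style mask
}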
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4021,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return BB;
   }
-  case AMDGPU::SI_INIT_EXEC:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
-            AMDGPU::EXEC)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_LO:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::EXEC_LO)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
-    // Extract the thread count from an SGPR input and set EXEC accordingly.
-    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
-    //
-    // S_BFE_U32 count, input, {shift, 7}
-    // S_BFM_B64 exec, count, 0
-    // S_CMP_EQ_U32 count, 64
-    // S_CMOV_B64 exec, -1
-    MachineInstr *FirstMI = &*BB->begin();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    Register InputReg = MI.getOperand(0).getReg();
-    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    bool Found = false;
-
-    // Move the COPY of the input reg to the beginning, so that we can use it.
-    for (auto I = BB->begin(); I != &MI; I++) {
-      if (I->getOpcode() != TargetOpcode::COPY ||
-          I->getOperand(0).getReg() != InputReg)
-        continue;
-
-      if (I == FirstMI) {
-        FirstMI = &*++BB->begin();
-      } else {
-        I->removeFromParent();
-        BB->insert(FirstMI, &*I);
-      }
-      Found = true;
-      break;
-    }
-    assert(Found);
-    (void)Found;
-
-    // This should be before all vector instructions.
-    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
-    bool isWave32 = getSubtarget()->isWave32();
-    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
-        .addReg(InputReg)
-        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
-            Exec)
-        .addReg(CountReg)
-        .addImm(0);
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
-        .addReg(CountReg, RegState::Kill)
-        .addImm(getSubtarget()->getWavefrontSize());
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-            Exec)
-        .addImm(-1);
-    MI.eraseFromParent();
-    return BB;
-  }
-
   case AMDGPU::GET_GROUPSTATICSIZE: {
     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -399,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
   (outs), (ins i64imm:$src),
   [(int_amdgcn_init_exec (i64 timm:$src))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
   let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave64;
 }
 
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
-  (outs), (ins i32imm:$src), []> {
-  let Defs = [EXEC_LO];
-  let usesCustomInserter = 1;
-  let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave32;
-}
-
-// FIXME: Wave32 version
 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   (outs), (ins SSrc_b32:$input, i32imm:$shift),
   [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec timm:$src),
-  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
-  let WaveSizePredicate = isWave32;
 }
 
 // Return for returning shaders to a shader variant epilog.
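With the LO variant gone, SI_INIT_EXEC carries a single i64 immediate for
both wave sizes, and the deferred lowering picks S_MOV_B32/EXEC_LO or
S_MOV_B64/EXEC as appropriate. A one-line C++ model of the value actually
written (illustrative sketch; initialExecMask is a made-up name):

#include <cstdint>

// Wave32 writes only EXEC_LO, so the immediate's low 32 bits take effect;
// wave64 writes the full 64-bit EXEC pair.
uint64_t initialExecMask(uint64_t Imm, bool IsWave32) {
  return IsWave32 ? (Imm & 0xffffffffull) : Imm;
}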
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -93,6 +93,8 @@ private:
 
   MachineBasicBlock *emitEndCf(MachineInstr &MI);
 
+  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
+
   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                         SmallVectorImpl<MachineOperand> &Src) const;
 
@@ -661,6 +663,90 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
   return SplitBB;
 }
 
+void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
+                                       MachineInstr &MI) {
+  MachineFunction &MF = *MBB->getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  bool IsWave32 = ST.isWave32();
+
+  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+    // This should be before all vector instructions.
+    BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
+        .addImm(MI.getOperand(0).getImm());
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MI);
+    MI.eraseFromParent();
+    return;
+  }
+
+  // Extract the thread count from an SGPR input and set EXEC accordingly.
+  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+  //
+  // S_BFE_U32 count, input, {shift, 7}
+  // S_BFM_B64 exec, count, 0
+  // S_CMP_EQ_U32 count, 64
+  // S_CMOV_B64 exec, -1
+  Register InputReg = MI.getOperand(0).getReg();
+  MachineInstr *FirstMI = &*MBB->begin();
+  if (InputReg.isVirtual()) {
+    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+    assert(DefInstr && DefInstr->isCopy());
+    if (DefInstr->getParent() == MBB) {
+      if (DefInstr != FirstMI) {
+        // If the `InputReg` is defined in current block, we also need to
+        // move that instruction to the beginning of the block.
+        DefInstr->removeFromParent();
+        MBB->insert(FirstMI, DefInstr);
+        if (LIS)
+          LIS->handleMove(*DefInstr);
+      } else {
+        // If first instruction is definition then move pointer after it.
+        FirstMI = &*std::next(FirstMI->getIterator());
+      }
+    }
+  }
+
+  // Insert instruction sequence at block beginning (before vector operations).
+  const DebugLoc DL = MI.getDebugLoc();
+  const unsigned WavefrontSize = ST.getWavefrontSize();
+  const unsigned Mask = (WavefrontSize << 1) - 1;
+  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+                   .addReg(InputReg)
+                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+  auto BfmMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+          .addReg(CountReg)
+          .addImm(0);
+  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+                   .addReg(CountReg, RegState::Kill)
+                   .addImm(WavefrontSize);
+  auto CmovMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+              Exec)
+          .addImm(-1);
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+
+  LIS->InsertMachineInstrInMaps(*BfeMI);
+  LIS->InsertMachineInstrInMaps(*BfmMI);
+  LIS->InsertMachineInstrInMaps(*CmpMI);
+  LIS->InsertMachineInstrInMaps(*CmovMI);
+
+  LIS->removeInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
 bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
   auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
     auto *S = B->getNextNode();
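One detail worth calling out in lowerInitExec is the hard-coded 0x70000:
assuming the usual SALU bitfield encoding (offset in the low bits of src1,
width in bits [22:16]), it is simply a field width of 7, matching the 7-bit
thread count. A small packing sketch (sBfeImm is a made-up helper; the
static_assert mirrors the s_bfe_u32 ..., 0x70008 expected by the tests below
for shift = 8):

#include <cstdint>

// Pack an S_BFE_U32 src1 immediate. The pass masks the shift with
// (WavefrontSize << 1) - 1 (127 for wave64) before packing.
constexpr uint32_t sBfeImm(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x7f) | (Width << 16);
}

static_assert(sBfeImm(8, 7) == 0x70008, "matches the tests' s_bfe_u32");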
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -781,6 +867,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
       SplitMBB = process(MI);
       break;
 
+    // FIXME: find a better place for this
+    case AMDGPU::SI_INIT_EXEC:
+    case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
+      lowerInitExec(MBB, MI);
+      if (LIS)
+        LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
+      break;
+
     default:
       break;
     }
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -84,6 +84,117 @@ main_body:
   unreachable
 }
 
+; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_mov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec(i64 -1)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_bfe_u32 s2, s2, 0x70008
+; GCN-NEXT: s_bfm_b64 exec, s2, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 64
+; GCN-NEXT: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
+; GCN-NOT: {{^}}v_
+; GCN: %endif
+; GCN: s_bfe_u32 s3, s2, 0x70008
+; GCN-NEXT: s_bfm_b64 exec, s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s3, 64
+; GCN-NEXT: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  ; ideally these alloca would be in %endif, but this causes problems on Windows GlobalISel
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+
+  %cc = icmp uge i32 %count, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  call void asm sideeffect "", ""()
+  br label %endif
+
+endif:
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v6 = add i32 %v5, %count
+  %v = bitcast i32 %v6 to float
+  ret float %v
+}
+
 declare void @llvm.amdgcn.init.exec(i64) #1
 declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1