AMDGPU/R600: Convert kernel param loads to use PARAM_I_ADDRESS
Non-extending, aligned i32 loads are still optimized to use CONSTANT_BUFFER (AS 8).

llvm-svn: 338610
This commit is contained in:
parent 5d0ac84a37
commit eb9e325c3b
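
As a rough sketch of the rule this change implements (not part of the commit; the helper name and parameters below are illustrative), a kernel-parameter load in PARAM_I_ADDRESS is only folded back into a CONSTANT_BUFFER (KC0) read when it is a non-extending, dword-aligned i32 load from a compile-time-constant offset:

// Illustrative summary of the early-out checks in the new constBufferLoad()
// helper further down; not code from the commit.
bool foldsToConstantBuffer(bool isNonExtLoad, bool hasConstantPtr,
                           unsigned scalarBits, unsigned alignBytes) {
  return isNonExtLoad &&     // sext/zext kernel-arg loads stay as VTX_READ
         hasConstantPtr &&   // offset must be a compile-time constant
         scalarBits == 32 && // only i32 scalars (TODO in the commit: smaller loads)
         alignBytes >= 4;    // at least dword alignment
}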
@@ -903,7 +903,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                    unsigned DwordOffset) const {
   unsigned ByteOffset = DwordOffset * 4;
   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                           AMDGPUASI.CONSTANT_BUFFER_0);
+                                           AMDGPUASI.PARAM_I_ADDRESS);
 
   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
   assert(isInt<16>(ByteOffset));
@@ -1457,33 +1457,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     return scalarizeVectorLoad(LoadNode, DAG);
   }
 
+  // This is still used for explicit load from addrspace(8)
   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
   if (ConstantBlock > -1 &&
       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
     SDValue Result;
-    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
-        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+    if (isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
         isa<ConstantSDNode>(Ptr)) {
-      SDValue Slots[4];
-      for (unsigned i = 0; i < 4; i++) {
-        // We want Const position encoded with the following formula :
-        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
-        // const_index is Ptr computed by llvm using an alignment of 16.
-        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
-        // then div by 4 at the ISel step
-        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
-            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
-        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
-      }
-      EVT NewVT = MVT::v4i32;
-      unsigned NumElements = 4;
-      if (VT.isVector()) {
-        NewVT = VT;
-        NumElements = VT.getVectorNumElements();
-      }
-      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+      return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG);
     } else {
+      //TODO: Does this even work?
       // non-constant ptr can't be folded, keeps it as a v4f32 load
       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
@@ -1622,7 +1606,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
   }
 
   PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                        AMDGPUASI.CONSTANT_BUFFER_0);
+                                        AMDGPUASI.PARAM_I_ADDRESS);
 
   // i64 isn't a legal type, so the register type used ends up as i32, which
   // isn't expected here. It attempts to create this sextload, but it ends up
@@ -1646,17 +1630,17 @@ SDValue R600TargetLowering::LowerFormalArguments(
 
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
+    unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset);
 
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
         ISD::UNINDEXED, Ext, VT, DL, Chain,
         DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
         PtrInfo,
-        MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+        MemVT, Alignment, MachineMemOperand::MONonTemporal |
                                     MachineMemOperand::MODereferenceable |
                                     MachineMemOperand::MOInvariant);
 
-    // 4 is the preferred alignment for the CONSTANT memory space.
     InVals.push_back(Arg);
   }
   return Chain;
@@ -1804,6 +1788,52 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
   return BuildVector;
 }
 
+SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(LoadNode);
+  EVT VT = LoadNode->getValueType(0);
+  SDValue Chain = LoadNode->getChain();
+  SDValue Ptr = LoadNode->getBasePtr();
+  assert (isa<ConstantSDNode>(Ptr));
+
+  //TODO: Support smaller loads
+  if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode))
+    return SDValue();
+
+  if (LoadNode->getAlignment() < 4)
+    return SDValue();
+
+  int ConstantBlock = ConstantAddressBlock(Block);
+
+  SDValue Slots[4];
+  for (unsigned i = 0; i < 4; i++) {
+    // We want Const position encoded with the following formula :
+    // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+    // const_index is Ptr computed by llvm using an alignment of 16.
+    // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+    // then div by 4 at the ISel step
+    SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+        DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
+    Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+  }
+  EVT NewVT = MVT::v4i32;
+  unsigned NumElements = 4;
+  if (VT.isVector()) {
+    NewVT = VT;
+    NumElements = VT.getVectorNumElements();
+  }
+  SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+  if (!VT.isVector()) {
+    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+                         DAG.getConstant(0, DL, MVT::i32));
+  }
+  SDValue MergedValues[2] = {
+    Result,
+    Chain
+  };
+  return DAG.getMergeValues(MergedValues, DL);
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Optimizations
 //===----------------------------------------------------------------------===//
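
The comment inside the Slots loop above is easier to follow as plain arithmetic. The sketch below is illustrative only (the helper names are not from the commit): each of the four channels reads byte address Ptr + 4*i + ConstantBlock*16, and the later "div by 4" at ISel turns that byte address into a constant-register dword index.

#include <array>
#include <cstdint>

// Per-channel byte addresses produced by the Slots loop (illustrative model).
std::array<uint32_t, 4> slotByteAddrs(uint32_t ptrByteOffset, uint32_t constantBlock) {
  std::array<uint32_t, 4> addrs{};
  for (uint32_t chan = 0; chan < 4; ++chan)
    addrs[chan] = ptrByteOffset + 4 * chan + constantBlock * 16; // mirrors 4 * i + ConstantBlock * 16
  return addrs;
}

// The "div by 4 at the ISel step": byte address -> constant-register dword index.
uint32_t kcDwordIndex(uint32_t byteAddr) { return byteAddr / 4; }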
@@ -2022,6 +2052,16 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
   }
 
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+    SDValue Ptr = LoadNode->getBasePtr();
+    if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
+        isa<ConstantSDNode>(Ptr))
+      return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
+    break;
+  }
+
   default: break;
   }
@@ -98,9 +98,11 @@ private:
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
   bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
                    SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
                    SelectionDAG &DAG) const;
+  SDValue constBufferLoad(LoadSDNode *LoadNode, int Block,
+                          SelectionDAG &DAG) const;
 
   SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
 };
@@ -16,13 +16,8 @@
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 
 
-; EG: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: MOV * T1.X, KC0[2].Z,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-
-; CM: LSHR * T0.X, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: MOV * T1.X, KC0[2].Z,
+; EGCM: VTX_READ_8{{.*}} #3
+; EGCM: KC0[2].Y
 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
   %ext = zext i8 %in to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
@@ -92,14 +87,8 @@ define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 sign
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
 ; HSA-VI: flat_store_dword
 
-
-; EG: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: MOV * T1.X, KC0[2].Z,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-
-; CM: LSHR * T0.X, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: MOV * T1.X, KC0[2].Z,
+; EGCM: VTX_READ_16
+; EGCM: KC0[2].Y
 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
   %ext = zext i16 %in to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
@@ -60,8 +60,11 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}test_implicit:
-; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56
-; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56
+; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 == KC0[3].Z
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], [[PTR:T[0-9]+.[XYZW]]]
+; EG-NOT: VTX_READ
+; EG-DAG: MOV {{\*?}} [[VAL]], KC0[3].Z
+; EG-DAG: LSHR {{\*? *}}[[PTR]], KC0[2].Y, literal
 define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
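
The arithmetic in the updated check lines above ("36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 == KC0[3].Z") follows from the KC0 layout: every 16-byte slot of the parameter buffer is one KC0[n] register and each dword within the slot is one channel. A small illustrative sketch (not part of the commit) that reproduces the register names the tests refer to:

#include <cstdio>

// Map a dword-aligned parameter byte offset to its KC0 register and channel.
void printKC0(unsigned byteOffset) {
  static const char Chans[4] = {'X', 'Y', 'Z', 'W'};
  unsigned Dword = byteOffset / 4; // constant-buffer dword index
  std::printf("KC0[%u].%c\n", Dword / 4, Chans[Dword % 4]);
}

int main() {
  printKC0(36); // first explicit kernel argument -> KC0[2].Y
  printKC0(56); // 36 + 4 (out pointer) + 4*4     -> KC0[3].Z
}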
@@ -73,7 +76,7 @@ define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
 
 ; FUNC-LABEL: {{^}}test_implicit_dyn:
 ; 36 prepended implicit bytes + 8(out pointer + in) = 44
-; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44
+; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44, #3
 define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*