AMDGPU/GlobalISel: Apply load bitcast to s.buffer.load intrinsic
Should also apply this to the non-scalar buffer loads.
parent 38a46e3c56
commit 59ad55e97d
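For orientation before the diff: the change routes s.buffer.load results through the same register-type coercion already used for ordinary loads, so a result type that is not a natural 32-bit register type is loaded in coerced form and bitcast back. Below is a minimal standalone sketch of that coercion rule; it is illustrative only, the function name is hypothetical, and it assumes LLVM's LLT type from llvm/Support/LowLevelTypeImpl.h (the committed helper is getBitcastRegisterType in the first hunk).

// Illustrative sketch, not part of the commit: types of 32 bits or fewer
// are coerced to a single scalar; larger types become 32-bit elements.
#include "llvm/Support/LowLevelTypeImpl.h"
using llvm::LLT;

static LLT coerceToRegisterType(LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 32)
    return LLT::scalar(Size);                  // e.g. <2 x s8> -> s16, <4 x s8> -> s32
  return LLT::scalarOrVector(Size / 32, 32);   // e.g. <6 x s16> (96 bits) -> <3 x s32>
}

In the patch itself this mapping is applied to G_AMDGPU_S_BUFFER_LOAD results via Helper.bitcastDst() whenever shouldBitcastLoadStoreType() requests it, which is what the test updates below check.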
@@ -122,20 +122,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
   };
 }
 
+static LLT getBitcastRegisterType(const LLT Ty) {
+  const unsigned Size = Ty.getSizeInBits();
+
+  LLT CoercedTy;
+  if (Size <= 32) {
+    // <2 x s8> -> s16
+    // <4 x s8> -> s32
+    return LLT::scalar(Size);
+  }
+
+  return LLT::scalarOrVector(Size / 32, 32);
+}
+
 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT Ty = Query.Types[TypeIdx];
-    unsigned Size = Ty.getSizeInBits();
-
-    LLT CoercedTy;
-    if (Size <= 32) {
-      // <2 x s8> -> s16
-      // <4 x s8> -> s32
-      CoercedTy = LLT::scalar(Size);
-    } else
-      CoercedTy = LLT::scalarOrVector(Size / 32, 32);
-
-    return std::make_pair(TypeIdx, CoercedTy);
+    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
   };
 }
 
@@ -335,6 +338,20 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
          !loadStoreBitcastWorkaround(Ty);
 }
 
+/// Return true if a load or store of the type should be lowered with a bitcast
+/// to a different type.
+static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
+                                       const unsigned MemSizeInBits) {
+  const unsigned Size = Ty.getSizeInBits();
+  if (Size != MemSizeInBits)
+    return Size <= 32 && Ty.isVector();
+
+  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+    return true;
+  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+         !isRegisterVectorElementType(Ty.getElementType());
+}
+
 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                          const GCNTargetMachine &TM)
   : ST(ST_) {
@@ -1048,16 +1065,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     // 16-bit vector parts.
     Actions.bitcastIf(
       [=](const LegalityQuery &Query) -> bool {
-        const LLT Ty = Query.Types[0];
-        const unsigned Size = Ty.getSizeInBits();
-
-        if (Size != Query.MMODescrs[0].SizeInBits)
-          return Size <= 32 && Ty.isVector();
-
-        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
-          return true;
-        return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
-               !isRegisterVectorElementType(Ty.getElementType());
+        return shouldBitcastLoadStoreType(ST, Query.Types[0],
+                                          Query.MMODescrs[0].SizeInBits);
       }, bitcastToRegisterType(0));
 
     Actions
@@ -4137,8 +4146,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
 }
 
 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
-  MachineInstr &MI, MachineIRBuilder &B,
-  GISelChangeObserver &Observer) const {
+  LegalizerHelper &Helper, MachineInstr &MI) const {
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  GISelChangeObserver &Observer = Helper.Observer;
+
   Register Dst = MI.getOperand(0).getReg();
   LLT Ty = B.getMRI()->getType(Dst);
   unsigned Size = Ty.getSizeInBits();
@@ -4146,6 +4157,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
 
   Observer.changingInstr(MI);
 
+  if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
+    Ty = getBitcastRegisterType(Ty);
+    Helper.bitcastDst(MI, Ty, 0);
+    Dst = MI.getOperand(0).getReg();
+    B.setInsertPt(B.getMBB(), MI);
+  }
+
   // FIXME: We don't really need this intermediate instruction. The intrinsic
   // should be fixed to have a memory operand. Since it's readnone, we're not
   // allowed to add one.
@@ -4167,8 +4185,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
   // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
   if (!isPowerOf2_32(Size)) {
-    LegalizerHelper Helper(MF, *this, Observer, B);
-
     if (Ty.isVector())
       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
     else
@@ -4360,7 +4376,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return true;
   }
   case Intrinsic::amdgcn_s_buffer_load:
-    return legalizeSBufferLoad(MI, B, Helper.Observer);
+    return legalizeSBufferLoad(Helper, MI);
   case Intrinsic::amdgcn_raw_buffer_store:
   case Intrinsic::amdgcn_struct_buffer_store:
     return legalizeBufferStore(MI, MRI, B, false, false);
@@ -167,9 +167,7 @@ public:
                               GISelChangeObserver &Observer,
                               const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
 
-  bool legalizeSBufferLoad(
-    MachineInstr &MI, MachineIRBuilder &B,
-    GISelChangeObserver &Observer) const;
+  bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
 
   bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
                             bool IsInc) const;
@@ -67,9 +67,10 @@ body: |
     ; GCN-LABEL: name: s_buffer_load_v6s16
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s16>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s16>), 0
-    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s16>)
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
+    ; GCN: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[EXTRACT]](<3 x s32>)
+    ; GCN: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
     %2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
@@ -124,13 +125,83 @@ body: |
     ; GCN-LABEL: name: s_buffer_load_v12s8
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<16 x s8>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<16 x s8>), 0
-    ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<12 x s8>)
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
+    ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
+    ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GCN: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
+    ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GCN: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
+    ; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; GCN: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32)
+    ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; GCN: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
+    ; GCN: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
+    ; GCN: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32)
+    ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+    ; GCN: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
+    ; GCN: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+    ; GCN: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
+    ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+    ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
+    ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; GCN: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]]
+    ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; GCN: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C4]]
+    ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+    ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; GCN: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+    ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+    ; GCN: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C4]]
+    ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32)
+    ; GCN: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]]
+    ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
+    ; GCN: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+    ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32)
+    ; GCN: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]]
+    ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32)
+    ; GCN: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C4]]
+    ; GCN: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32)
+    ; GCN: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]]
+    ; GCN: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+    ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32)
+    ; GCN: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C4]]
+    ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
+    ; GCN: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C4]]
+    ; GCN: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32)
+    ; GCN: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]]
+    ; GCN: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
+    ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
+    ; GCN: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C4]]
+    ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
+    ; GCN: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C4]]
+    ; GCN: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32)
+    ; GCN: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]]
+    ; GCN: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+    ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
+    ; GCN: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C4]]
+    ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+    ; GCN: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C4]]
+    ; GCN: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32)
+    ; GCN: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]]
+    ; GCN: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+    ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
     %2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
-    S_ENDPGM 0, implicit %2
+    %3:_(<12 x s16>) = G_ANYEXT %2
+    S_ENDPGM 0, implicit %3
 
 ...