
AMDGPU/GlobalISel: Apply load bitcast to s.buffer.load intrinsic

Should also apply this to the non-scalar buffer loads.
Matt Arsenault 2020-06-16 14:52:14 -04:00
parent 38a46e3c56
commit 59ad55e97d
3 changed files with 121 additions and 36 deletions
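For a quick illustration of the coercion this change reuses for s.buffer.load results, here is a standalone sketch (not part of the patch) that mirrors the new getBitcastRegisterType helper; it assumes the LLT API from llvm/Support/LowLevelTypeImpl.h as it existed around this commit:

// Standalone sketch mirroring the patch's getBitcastRegisterType helper.
// Types of 32 bits or fewer collapse to a scalar of the same width
// (<2 x s8> -> s16, <4 x s8> -> s32); larger types become <Size/32 x s32>.
#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static LLT coerceToRegisterType(LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 32)
    return LLT::scalar(Size);
  return LLT::scalarOrVector(Size / 32, 32);
}

int main() {
  // The two cases from the updated MIR test: both are 96 bits wide,
  // so both coerce to <3 x s32> before any widening to a power of two.
  errs() << coerceToRegisterType(LLT::vector(6, 16)) << '\n';  // <3 x s32>
  errs() << coerceToRegisterType(LLT::vector(12, 8)) << '\n';  // <3 x s32>
  return 0;
}

This is why the s_buffer_load_v6s16 and s_buffer_load_v12s8 tests below now expect a <4 x s32> G_AMDGPU_S_BUFFER_LOAD whose <3 x s32> G_EXTRACT is then bitcast or unmerged back into the original element type.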


@@ -122,20 +122,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
static LLT getBitcastRegisterType(const LLT Ty) {
const unsigned Size = Ty.getSizeInBits();
LLT CoercedTy;
if (Size <= 32) {
// <2 x s8> -> s16
// <4 x s8> -> s32
return LLT::scalar(Size);
}
return LLT::scalarOrVector(Size / 32, 32);
}
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();
LLT CoercedTy;
if (Size <= 32) {
// <2 x s8> -> s16
// <4 x s8> -> s32
CoercedTy = LLT::scalar(Size);
} else
CoercedTy = LLT::scalarOrVector(Size / 32, 32);
return std::make_pair(TypeIdx, CoercedTy);
return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
};
}
@@ -335,6 +338,20 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
!loadStoreBitcastWorkaround(Ty);
}
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
const unsigned MemSizeInBits) {
const unsigned Size = Ty.getSizeInBits();
if (Size != MemSizeInBits)
return Size <= 32 && Ty.isVector();
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
return true;
return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
!isRegisterVectorElementType(Ty.getElementType());
}
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
@@ -1048,16 +1065,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// 16-bit vector parts.
Actions.bitcastIf(
[=](const LegalityQuery &Query) -> bool {
const LLT Ty = Query.Types[0];
const unsigned Size = Ty.getSizeInBits();
if (Size != Query.MMODescrs[0].SizeInBits)
return Size <= 32 && Ty.isVector();
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
return true;
return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
!isRegisterVectorElementType(Ty.getElementType());
return shouldBitcastLoadStoreType(ST, Query.Types[0],
Query.MMODescrs[0].SizeInBits);
}, bitcastToRegisterType(0));
Actions
@@ -4137,8 +4146,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
MachineInstr &MI, MachineIRBuilder &B,
GISelChangeObserver &Observer) const {
LegalizerHelper &Helper, MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
GISelChangeObserver &Observer = Helper.Observer;
Register Dst = MI.getOperand(0).getReg();
LLT Ty = B.getMRI()->getType(Dst);
unsigned Size = Ty.getSizeInBits();
@@ -4146,6 +4157,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
Observer.changingInstr(MI);
if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
Ty = getBitcastRegisterType(Ty);
Helper.bitcastDst(MI, Ty, 0);
Dst = MI.getOperand(0).getReg();
B.setInsertPt(B.getMBB(), MI);
}
// FIXME: We don't really need this intermediate instruction. The intrinsic
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
@@ -4167,8 +4185,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
if (!isPowerOf2_32(Size)) {
LegalizerHelper Helper(MF, *this, Observer, B);
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
@@ -4360,7 +4376,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return true;
}
case Intrinsic::amdgcn_s_buffer_load:
return legalizeSBufferLoad(MI, B, Helper.Observer);
return legalizeSBufferLoad(Helper, MI);
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store:
return legalizeBufferStore(MI, MRI, B, false, false);


@@ -167,9 +167,7 @@ public:
GISelChangeObserver &Observer,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
bool legalizeSBufferLoad(
MachineInstr &MI, MachineIRBuilder &B,
GISelChangeObserver &Observer) const;
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
bool IsInc) const;


@@ -67,9 +67,10 @@ body: |
; GCN-LABEL: name: s_buffer_load_v6s16
; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s16>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s16>), 0
; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s16>)
; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
; GCN: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[EXTRACT]](<3 x s32>)
; GCN: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
@@ -124,13 +125,83 @@ body: |
; GCN-LABEL: name: s_buffer_load_v12s8
; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<16 x s8>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<16 x s8>), 0
; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<12 x s8>)
; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; GCN: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GCN: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
; GCN: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32)
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
; GCN: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
; GCN: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
; GCN: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32)
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
; GCN: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
; GCN: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
; GCN: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; GCN: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]]
; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; GCN: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C4]]
; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; GCN: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
; GCN: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C4]]
; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32)
; GCN: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]]
; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
; GCN: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32)
; GCN: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]]
; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32)
; GCN: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C4]]
; GCN: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32)
; GCN: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]]
; GCN: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32)
; GCN: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C4]]
; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
; GCN: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C4]]
; GCN: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32)
; GCN: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]]
; GCN: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
; GCN: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C4]]
; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; GCN: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C4]]
; GCN: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32)
; GCN: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]]
; GCN: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
; GCN: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C4]]
; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
; GCN: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C4]]
; GCN: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32)
; GCN: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]]
; GCN: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
S_ENDPGM 0, implicit %2
%3:_(<12 x s16>) = G_ANYEXT %2
S_ENDPGM 0, implicit %3
...