
[AMDGPU] Add A16/G16 to InstCombine

When sampling from images with coordinates that only have 16-bit
accuracy, convert the image intrinsic call to use a16 or g16. This
only happens if the target hardware supports it.

An alternative would be to always apply this combination, independent of
the target hardware, and extend 16-bit arguments to 32-bit arguments
during legalization. To me, this sounds like an unnecessary roundtrip
that could prevent some further InstCombine optimizations.
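
As an illustrative sketch (assuming the usual half-coordinate overload
naming of the image sample intrinsics): on a16-capable hardware, a sample
whose coordinate is only an extension from half, e.g.

  %s32 = fpext half %s to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)

is expected to be rewritten to the f16 overload, dropping the extension:

  %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)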

Differential Revision: https://reviews.llvm.org/D85887
Sebastian Neubauer 2020-06-04 12:22:36 +02:00
parent cfcda8d055
commit b1fe63844a
6 changed files with 1373 additions and 13 deletions

@@ -682,10 +682,16 @@ class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RES
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !if(P_.IsAtomic, 0, 1);
  int NumExtraAddrArgs = !size(P_.ExtraAddrArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(NumDataArgs, NumDmaskArgs);
  int GradientArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs);
  int CoordArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs, NumGradientArgs);
  int UnormArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, 1);
  int TexFailCtrlArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);

@@ -53,12 +53,119 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}
Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);
@@ -715,6 +822,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}
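
As a rough sketch of the OnlyDerivatives path above (a hypothetical sample.d
call, assuming the gradient/coordinate type mangling of the image intrinsics):
on a target with G16 but 32-bit coordinates, only the gradient operands and
the gradient type argument are rewritten, so

  %dsdh = fpext half %dsdh.h to float
  %dsdv = fpext half %dsdv.h to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)

would be expected to become

  %res = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh.h, half %dsdv.h, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)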

@@ -52,6 +52,11 @@ struct ImageDimIntrinsicInfo {
  unsigned Intr;
  unsigned BaseOpcode;
  MIMGDim Dim;
  unsigned GradientStart;
  unsigned CoordStart;
  unsigned VAddrEnd;
  unsigned GradientTyArg;
  unsigned CoordTyArg;
};

const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);

@@ -840,11 +840,19 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
  Intrinsic Intr = I;
  MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
  AMDGPUDimProps Dim = I.P.Dim;
  AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;

  bits<8> GradientStart = DimEval.GradientArgIndex;
  bits<8> CoordStart = DimEval.CoordArgIndex;
  bits<8> VAddrEnd = !add(DimEval.VAddrArgIndex, DimEval.NumVAddrArgs);

  bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
    !foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
  bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
}

def ImageDimIntrinsicTable : GenericTable {
  let FilterClass = "ImageDimIntrinsicInfo";
  let Fields = ["Intr", "BaseOpcode", "Dim"];
  let Fields = ["Intr", "BaseOpcode", "Dim", "GradientStart", "CoordStart", "VAddrEnd", "GradientTyArg", "CoordTyArg"];
  GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
  GenericEnum TypeOf_Dim = MIMGDim;

@@ -0,0 +1,108 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -instcombine -S < %s | FileCheck %s
; --------------------------------------------------------------------
; llvm.amdgcn.image.sample a16 is disabled on pre-gfx9
; --------------------------------------------------------------------
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
define amdgpu_kernel void @image_sample_a16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
; CHECK-LABEL: @image_sample_a16_1d(
; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
%s32 = fpext half %s to float
%res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @image_sample_a16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; CHECK-LABEL: @image_sample_a16_2d(
; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
%s32 = fpext half %s to float
%t32 = fpext half %t to float
%res = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @image_sample_a16_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) {
; CHECK-LABEL: @image_sample_a16_3d(
; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT: [[R32:%.*]] = fpext half [[R:%.*]] to float
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[R32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
%s32 = fpext half %s to float
%t32 = fpext half %t to float
%r32 = fpext half %r to float
%res = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @image_sample_a16_cube(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) {
;
; CHECK-LABEL: @image_sample_a16_cube(
; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT: [[FACE32:%.*]] = fpext half [[FACE:%.*]] to float
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[FACE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
%s32 = fpext half %s to float
%t32 = fpext half %t to float
%face32 = fpext half %face to float
%res = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @image_sample_a16_1darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) {
; CHECK-LABEL: @image_sample_a16_1darray(
; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float [[S32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
%s32 = fpext half %s to float
%slice32 = fpext half %slice to float
%res = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @image_sample_a16_2darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) {
; CHECK-LABEL: @image_sample_a16_2darray(
; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
%s32 = fpext half %s to float
%t32 = fpext half %t to float
%slice32 = fpext half %slice to float
%res = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
store <4 x float> %res, <4 x float> addrspace(1)* %out
ret void
}
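
The combine is not limited to floating-point sample coordinates:
canSafelyConvertTo16Bit also accepts sext/zext from i16, so integer
coordinates (e.g. of image.load) are converted back to i16 on a16-capable
targets. As a hedged sketch (illustrative only, not taken from the suppressed
test file):

  %x32 = zext i16 %x to i32
  %res = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %x32, <8 x i32> %rsrc, i32 0, i32 0)

would be expected to combine to

  %res = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)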

File diff suppressed because it is too large.