[AMDGPU] Add A16/G16 to InstCombine
When sampling from images with coordinates that only have 16-bit accuracy, convert the image intrinsic call to use a16 or g16. This only happens if the target hardware supports it.

An alternative would be to always apply this combine, independent of the target hardware, and extend the 16-bit arguments back to 32-bit arguments during legalization. To me, that sounds like an unnecessary round trip that could prevent further InstCombine optimizations.

Differential Revision: https://reviews.llvm.org/D85887
commit b1fe63844a (parent cfcda8d055)
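For illustration, a minimal sketch of the intended rewrite on a target that does support a16 (e.g. gfx9). The f16-typed overload shown here follows the naming convention of the f32 declarations in the tests below; it is illustrative, not copied from this diff:

  ; before: the coordinate is an extended half, sampled via the f32 overload
  %s32 = fpext half %s to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)

  ; after: the fpext is folded away and the a16 (f16) variant is called directly
  %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)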
@@ -682,10 +682,16 @@ class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RES
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !if(P_.IsAtomic, 0, 1);
  int NumExtraAddrArgs = !size(P_.ExtraAddrArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(NumDataArgs, NumDmaskArgs);
  int GradientArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs);
  int CoordArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs, NumGradientArgs);
  int UnormArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, 1);
  int TexFailCtrlArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
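As a worked example of these indices, take a 2D sample-with-derivatives intrinsic (llvm.amdgcn.image.sample.d.2d, assuming the usual operand layout of dmask, four gradients, then two coordinates): NumDataArgs = 0 and NumDmaskArgs = 1, so GradientArgIndex = 1; NumGradientArgs = 4 gives CoordArgIndex = 5; and with NumVAddrArgs = 6, the address arguments end at index 7 (VAddrArgIndex = 1 plus 6). These are exactly the GradientStart, CoordStart, and VAddrEnd values the InstCombine code below iterates over.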
@@ -53,12 +53,119 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}
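Tying the two helpers together, a hypothetical snippet (%h, %a, %b are invented names): canSafelyConvertTo16Bit accepts %wide below because it is an fpext from half, and convertTo16Bit then simply returns %h, erasing the extension. A ConstantFP such as float 2.0 also qualifies, since it converts to half without losing information, and CreateFPCast folds it to a half constant. A value like %sum carries genuine 32-bit precision and is rejected, leaving the intrinsic untouched.

  %wide = fpext half %h to float   ; accepted: just a widened half
  %sum  = fadd float %a, %b        ; rejected: a real 32-bit value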
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}
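To make the OnlyDerivatives path concrete: on a target that has G16 but where the coordinates cannot be narrowed (or A16 is missing), only the derivative operands change type while the coordinates stay 32-bit. A hypothetical sketch for a 1D sample.d call, assuming the two-slot overload naming (gradient type, then coordinate type) used by these intrinsics:

  ; before: gradients were extended from half, the coordinate is a true float
  %dsdx32 = fpext half %dsdx to float
  %dsdy32 = fpext half %dsdy to float
  %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdx32, float %dsdy32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)

  ; after: the gradient overload becomes f16, the coordinate overload stays f32
  %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdx, half %dsdy, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)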
Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
-  default:
-    break;
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

@@ -715,6 +822,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
    break;
  }
+  default: {
+    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+    }
+  }
  }
  return None;
}
@@ -52,6 +52,11 @@ struct ImageDimIntrinsicInfo {
  unsigned Intr;
  unsigned BaseOpcode;
  MIMGDim Dim;
  unsigned GradientStart;
  unsigned CoordStart;
  unsigned VAddrEnd;
  unsigned GradientTyArg;
  unsigned CoordTyArg;
};
const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
@@ -840,11 +840,19 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
  Intrinsic Intr = I;
  MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
  AMDGPUDimProps Dim = I.P.Dim;
  AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;

  bits<8> GradientStart = DimEval.GradientArgIndex;
  bits<8> CoordStart = DimEval.CoordArgIndex;
  bits<8> VAddrEnd = !add(DimEval.VAddrArgIndex, DimEval.NumVAddrArgs);
  bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
    !foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
  bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
}

def ImageDimIntrinsicTable : GenericTable {
  let FilterClass = "ImageDimIntrinsicInfo";
-  let Fields = ["Intr", "BaseOpcode", "Dim"];
+  let Fields = ["Intr", "BaseOpcode", "Dim", "GradientStart", "CoordStart", "VAddrEnd", "GradientTyArg", "CoordTyArg"];
  GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
  GenericEnum TypeOf_Dim = MIMGDim;
test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics-gfx8.ll (new file, 108 lines)
@@ -0,0 +1,108 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -instcombine -S < %s | FileCheck %s

; --------------------------------------------------------------------
; llvm.amdgcn.image.sample a16 is disabled on pre-gfx9
; --------------------------------------------------------------------

declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

define amdgpu_kernel void @image_sample_a16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
; CHECK-LABEL: @image_sample_a16_1d(
; CHECK-NEXT:    [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
  %s32 = fpext half %s to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  store <4 x float> %res, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @image_sample_a16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; CHECK-LABEL: @image_sample_a16_2d(
; CHECK-NEXT:    [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT:    [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
  %s32 = fpext half %s to float
  %t32 = fpext half %t to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  store <4 x float> %res, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @image_sample_a16_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) {
; CHECK-LABEL: @image_sample_a16_3d(
; CHECK-NEXT:    [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT:    [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT:    [[R32:%.*]] = fpext half [[R:%.*]] to float
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[R32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
  %s32 = fpext half %s to float
  %t32 = fpext half %t to float
  %r32 = fpext half %r to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  store <4 x float> %res, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @image_sample_a16_cube(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) {
;
; CHECK-LABEL: @image_sample_a16_cube(
; CHECK-NEXT:    [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT:    [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT:    [[FACE32:%.*]] = fpext half [[FACE:%.*]] to float
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[FACE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
  %s32 = fpext half %s to float
  %t32 = fpext half %t to float
  %face32 = fpext half %face to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  store <4 x float> %res, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @image_sample_a16_1darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) {
; CHECK-LABEL: @image_sample_a16_1darray(
; CHECK-NEXT:    [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT:    [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float [[S32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
  %s32 = fpext half %s to float
  %slice32 = fpext half %slice to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  store <4 x float> %res, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @image_sample_a16_2darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) {
; CHECK-LABEL: @image_sample_a16_2darray(
; CHECK-NEXT:    [[S32:%.*]] = fpext half [[S:%.*]] to float
; CHECK-NEXT:    [[T32:%.*]] = fpext half [[T:%.*]] to float
; CHECK-NEXT:    [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
  %s32 = fpext half %s to float
  %t32 = fpext half %t to float
  %slice32 = fpext half %slice to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  store <4 x float> %res, <4 x float> addrspace(1)* %out
  ret void
}
File diff suppressed because it is too large.