mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[AMDGPU] Add amdgpu_gfx calling convention
Add a calling convention called amdgpu_gfx for real function calls within graphics shaders. For the moment, this uses the same calling convention as other calls in amdgpu, with registers excluded for return address, stack pointer and stack buffer descriptor. Differential Revision: https://reviews.llvm.org/D88540
This commit is contained in:
parent
376c9070ec
commit
7e4be9501b
@ -8116,6 +8116,25 @@ The following table illustrates the required format:
|
||||
the top 32 bits of the pipeline, so the shader may use the program
|
||||
counter's top 32 bits.
|
||||
|
||||
.. _pal_call-convention:
|
||||
|
||||
Call Convention
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
For graphics use cases, the calling convention is `amdgpu_gfx`.
|
||||
|
||||
.. note::
|
||||
|
||||
`amdgpu_gfx` Function calls are currently in development and are
|
||||
subject to major changes.
|
||||
|
||||
This calling convention shares most properties with calling non-kernel
|
||||
functions (see
|
||||
:ref:`amdgpu-amdhsa-function-call-convention-non-kernel-functions`).
|
||||
Differences are:
|
||||
|
||||
- Currently there are none, differences will be listed here
|
||||
|
||||
Unspecified OS
|
||||
--------------
|
||||
|
||||
|
@ -241,6 +241,9 @@ namespace CallingConv {
|
||||
/// The remainder matches the regular calling convention.
|
||||
WASM_EmscriptenInvoke = 99,
|
||||
|
||||
/// Calling convention used for AMD graphics targets.
|
||||
AMDGPU_Gfx = 100,
|
||||
|
||||
/// The highest possible calling convention ID. Must be some 2^k - 1.
|
||||
MaxID = 1023
|
||||
};
|
||||
|
@ -624,6 +624,7 @@ lltok::Kind LLLexer::LexIdentifier() {
|
||||
KEYWORD(amdgpu_ps);
|
||||
KEYWORD(amdgpu_cs);
|
||||
KEYWORD(amdgpu_kernel);
|
||||
KEYWORD(amdgpu_gfx);
|
||||
KEYWORD(tailcc);
|
||||
|
||||
KEYWORD(cc);
|
||||
|
@ -2134,6 +2134,7 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
|
||||
case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break;
|
||||
case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break;
|
||||
case lltok::kw_amdgpu_vs: CC = CallingConv::AMDGPU_VS; break;
|
||||
case lltok::kw_amdgpu_gfx: CC = CallingConv::AMDGPU_Gfx; break;
|
||||
case lltok::kw_amdgpu_ls: CC = CallingConv::AMDGPU_LS; break;
|
||||
case lltok::kw_amdgpu_hs: CC = CallingConv::AMDGPU_HS; break;
|
||||
case lltok::kw_amdgpu_es: CC = CallingConv::AMDGPU_ES; break;
|
||||
|
@ -170,6 +170,7 @@ enum Kind {
|
||||
kw_amdgpu_ps,
|
||||
kw_amdgpu_cs,
|
||||
kw_amdgpu_kernel,
|
||||
kw_amdgpu_gfx,
|
||||
kw_tailcc,
|
||||
|
||||
// Attributes:
|
||||
|
@ -399,6 +399,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
|
||||
case CallingConv::AMDGPU_PS: Out << "amdgpu_ps"; break;
|
||||
case CallingConv::AMDGPU_CS: Out << "amdgpu_cs"; break;
|
||||
case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
|
||||
case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -456,7 +456,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
||||
Info = analyzeResourceUsage(MF);
|
||||
}
|
||||
|
||||
if (STM.isAmdPalOS())
|
||||
if (STM.isAmdPalOS() && MFI->isEntryFunction())
|
||||
EmitPALMetadata(MF, CurrentProgramInfo);
|
||||
else if (!STM.isAmdHsaOS()) {
|
||||
EmitProgramInfoSI(MF, CurrentProgramInfo);
|
||||
|
@ -460,8 +460,8 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
|
||||
|
||||
CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
|
||||
const bool IsShader = AMDGPU::isShader(CC);
|
||||
const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
|
||||
AMDGPU::isKernel(CC);
|
||||
const bool IsWaveEnd =
|
||||
(IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
|
||||
if (IsWaveEnd) {
|
||||
B.buildInstr(AMDGPU::S_ENDPGM)
|
||||
.addImm(0);
|
||||
@ -785,7 +785,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
|
||||
if (CC == CallingConv::AMDGPU_KERNEL)
|
||||
return lowerFormalArgumentsKernel(B, F, VRegs);
|
||||
|
||||
const bool IsShader = AMDGPU::isShader(CC);
|
||||
const bool IsGraphics = AMDGPU::isGraphics(CC);
|
||||
const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
|
||||
|
||||
MachineFunction &MF = B.getMF();
|
||||
@ -826,7 +826,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
|
||||
const bool InReg = Arg.hasAttribute(Attribute::InReg);
|
||||
|
||||
// SGPR arguments to functions not implemented.
|
||||
if (!IsShader && InReg)
|
||||
if (!IsGraphics && InReg)
|
||||
return false;
|
||||
|
||||
if (Arg.hasAttribute(Attribute::SwiftSelf) ||
|
||||
@ -937,7 +937,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
|
||||
|
||||
// Start adding system SGPRs.
|
||||
if (IsEntryFunc) {
|
||||
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
|
||||
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
|
||||
} else {
|
||||
CCInfo.AllocateReg(Info->getScratchRSrcReg());
|
||||
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
|
||||
@ -1131,11 +1131,6 @@ static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
|
||||
|
||||
bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
|
||||
CallLoweringInfo &Info) const {
|
||||
if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
|
||||
LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Info.IsVarArg) {
|
||||
LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
|
||||
return false;
|
||||
@ -1149,8 +1144,15 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
|
||||
const DataLayout &DL = F.getParent()->getDataLayout();
|
||||
CallingConv::ID CallConv = F.getCallingConv();
|
||||
|
||||
if (AMDGPU::isShader(F.getCallingConv())) {
|
||||
if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
|
||||
CallConv != CallingConv::AMDGPU_Gfx) {
|
||||
LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (AMDGPU::isShader(CallConv)) {
|
||||
LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
|
||||
return false;
|
||||
}
|
||||
|
@ -16,7 +16,75 @@ class CCIfExtend<CCAction A>
|
||||
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
|
||||
|
||||
// Calling convention for SI
|
||||
def CC_SI : CallingConv<[
|
||||
def CC_SI_Gfx : CallingConv<[
|
||||
// 0-3 are reserved for the stack buffer descriptor
|
||||
// 30-31 are reserved for the return address
|
||||
// 32 is reserved for the stack pointer
|
||||
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
SGPR4, SGPR5, SGPR6, SGPR7,
|
||||
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
|
||||
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
|
||||
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
|
||||
]>>>,
|
||||
|
||||
CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
|
||||
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
|
||||
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
|
||||
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
|
||||
]>>>,
|
||||
|
||||
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
|
||||
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
|
||||
CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
|
||||
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
|
||||
CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
|
||||
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
|
||||
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
|
||||
]>;
|
||||
|
||||
def RetCC_SI_Gfx : CallingConv<[
|
||||
// 0-3 are reserved for the stack buffer descriptor
|
||||
// 32 is reserved for the stack pointer
|
||||
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
SGPR4, SGPR5, SGPR6, SGPR7,
|
||||
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
|
||||
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
|
||||
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
|
||||
SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
|
||||
SGPR40, SGPR41, SGPR42, SGPR43
|
||||
]>>>,
|
||||
|
||||
CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
|
||||
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
|
||||
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
|
||||
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
|
||||
VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
|
||||
VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
|
||||
VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
|
||||
VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
|
||||
VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
|
||||
VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
|
||||
VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
|
||||
VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
|
||||
VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
|
||||
VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
|
||||
VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
|
||||
VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
|
||||
VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
|
||||
]>>>,
|
||||
|
||||
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
|
||||
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
|
||||
CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
|
||||
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
|
||||
CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
|
||||
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
|
||||
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
|
||||
]>;
|
||||
|
||||
def CC_SI_SHADER : CallingConv<[
|
||||
|
||||
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
|
||||
@ -161,7 +229,7 @@ def CC_AMDGPU : CallingConv<[
|
||||
CCIf<"static_cast<const GCNSubtarget&>"
|
||||
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
|
||||
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
|
||||
CCDelegateTo<CC_SI>>,
|
||||
CCDelegateTo<CC_SI_SHADER>>,
|
||||
CCIf<"static_cast<const GCNSubtarget&>"
|
||||
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
|
||||
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
|
||||
|
@ -942,6 +942,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
|
||||
case CallingConv::Fast:
|
||||
case CallingConv::Cold:
|
||||
return CC_AMDGPU_Func;
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return CC_SI_Gfx;
|
||||
case CallingConv::AMDGPU_KERNEL:
|
||||
case CallingConv::SPIR_KERNEL:
|
||||
default:
|
||||
@ -963,6 +965,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
|
||||
case CallingConv::AMDGPU_ES:
|
||||
case CallingConv::AMDGPU_LS:
|
||||
return RetCC_SI_Shader;
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return RetCC_SI_Gfx;
|
||||
case CallingConv::C:
|
||||
case CallingConv::Fast:
|
||||
case CallingConv::Cold:
|
||||
|
@ -829,31 +829,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
|
||||
}
|
||||
}
|
||||
|
||||
static bool isArgPassedInSGPR(const Argument *A) {
|
||||
const Function *F = A->getParent();
|
||||
|
||||
// Arguments to compute shaders are never a source of divergence.
|
||||
CallingConv::ID CC = F->getCallingConv();
|
||||
switch (CC) {
|
||||
case CallingConv::AMDGPU_KERNEL:
|
||||
case CallingConv::SPIR_KERNEL:
|
||||
return true;
|
||||
case CallingConv::AMDGPU_VS:
|
||||
case CallingConv::AMDGPU_LS:
|
||||
case CallingConv::AMDGPU_HS:
|
||||
case CallingConv::AMDGPU_ES:
|
||||
case CallingConv::AMDGPU_GS:
|
||||
case CallingConv::AMDGPU_PS:
|
||||
case CallingConv::AMDGPU_CS:
|
||||
// For non-compute shaders, SGPR inputs are marked with either inreg.
|
||||
// Everything else is in VGPRs.
|
||||
return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg);
|
||||
default:
|
||||
// TODO: Should calls support inreg for SGPR inputs?
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
|
||||
/// this is analyzing the collective result of all output registers. Otherwise,
|
||||
/// this is only querying a specific result index if this returns multiple
|
||||
@ -910,7 +885,7 @@ bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
|
||||
/// different across workitems in a wavefront.
|
||||
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
|
||||
if (const Argument *A = dyn_cast<Argument>(V))
|
||||
return !isArgPassedInSGPR(A);
|
||||
return !AMDGPU::isArgPassedInSGPR(A);
|
||||
|
||||
// Loads from the private and flat address spaces are divergent, because
|
||||
// threads can execute the load instruction with the same inputs and get
|
||||
|
@ -76,7 +76,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
|
||||
const GCNSubtarget *ST;
|
||||
const SITargetLowering *TLI;
|
||||
AMDGPUTTIImpl CommonTTI;
|
||||
bool IsGraphicsShader;
|
||||
bool IsGraphics;
|
||||
bool HasFP32Denormals;
|
||||
bool HasFP64FP16Denormals;
|
||||
unsigned MaxVGPRs;
|
||||
@ -142,7 +142,7 @@ public:
|
||||
: BaseT(TM, F.getParent()->getDataLayout()),
|
||||
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
|
||||
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
|
||||
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
|
||||
IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
|
||||
MaxVGPRs(ST->getMaxNumVGPRs(
|
||||
std::max(ST->getWavesPerEU(F).first,
|
||||
ST->getWavesPerEUForWorkGroup(
|
||||
@ -222,7 +222,7 @@ public:
|
||||
unsigned getFlatAddressSpace() const {
|
||||
// Don't bother running InferAddressSpaces pass on graphics shaders which
|
||||
// don't use flat addressing.
|
||||
if (IsGraphicsShader)
|
||||
if (IsGraphics)
|
||||
return -1;
|
||||
return AMDGPUAS::FLAT_ADDRESS;
|
||||
}
|
||||
|
@ -1780,12 +1780,11 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
|
||||
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
|
||||
}
|
||||
|
||||
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
|
||||
CallingConv::ID CallConv,
|
||||
ArrayRef<ISD::InputArg> Ins,
|
||||
BitVector &Skipped,
|
||||
FunctionType *FType,
|
||||
SIMachineFunctionInfo *Info) {
|
||||
static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
|
||||
CallingConv::ID CallConv,
|
||||
ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
|
||||
FunctionType *FType,
|
||||
SIMachineFunctionInfo *Info) {
|
||||
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
|
||||
const ISD::InputArg *Arg = &Ins[I];
|
||||
|
||||
@ -2237,7 +2236,7 @@ SDValue SITargetLowering::LowerFormalArguments(
|
||||
FunctionType *FType = MF.getFunction().getFunctionType();
|
||||
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
|
||||
if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
|
||||
DiagnosticInfoUnsupported NoGraphicsHSA(
|
||||
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
|
||||
DAG.getContext()->diagnose(NoGraphicsHSA);
|
||||
@ -2250,12 +2249,21 @@ SDValue SITargetLowering::LowerFormalArguments(
|
||||
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
|
||||
*DAG.getContext());
|
||||
|
||||
bool IsShader = AMDGPU::isShader(CallConv);
|
||||
bool IsGraphics = AMDGPU::isGraphics(CallConv);
|
||||
bool IsKernel = AMDGPU::isKernel(CallConv);
|
||||
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
|
||||
|
||||
if (IsShader) {
|
||||
processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
|
||||
if (IsGraphics) {
|
||||
assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
|
||||
(!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
|
||||
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
|
||||
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
|
||||
!Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
|
||||
!Info->hasWorkItemIDZ());
|
||||
}
|
||||
|
||||
if (CallConv == CallingConv::AMDGPU_PS) {
|
||||
processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
|
||||
|
||||
// At least one interpolation mode must be enabled or else the GPU will
|
||||
// hang.
|
||||
@ -2270,40 +2278,28 @@ SDValue SITargetLowering::LowerFormalArguments(
|
||||
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
|
||||
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
|
||||
// enabled too.
|
||||
if (CallConv == CallingConv::AMDGPU_PS) {
|
||||
if ((Info->getPSInputAddr() & 0x7F) == 0 ||
|
||||
((Info->getPSInputAddr() & 0xF) == 0 &&
|
||||
Info->isPSInputAllocated(11))) {
|
||||
CCInfo.AllocateReg(AMDGPU::VGPR0);
|
||||
CCInfo.AllocateReg(AMDGPU::VGPR1);
|
||||
Info->markPSInputAllocated(0);
|
||||
Info->markPSInputEnabled(0);
|
||||
}
|
||||
if (Subtarget->isAmdPalOS()) {
|
||||
// For isAmdPalOS, the user does not enable some bits after compilation
|
||||
// based on run-time states; the register values being generated here are
|
||||
// the final ones set in hardware. Therefore we need to apply the
|
||||
// workaround to PSInputAddr and PSInputEnable together. (The case where
|
||||
// a bit is set in PSInputAddr but not PSInputEnable is where the
|
||||
// frontend set up an input arg for a particular interpolation mode, but
|
||||
// nothing uses that input arg. Really we should have an earlier pass
|
||||
// that removes such an arg.)
|
||||
unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
|
||||
if ((PsInputBits & 0x7F) == 0 ||
|
||||
((PsInputBits & 0xF) == 0 &&
|
||||
(PsInputBits >> 11 & 1)))
|
||||
Info->markPSInputEnabled(
|
||||
countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
|
||||
}
|
||||
if ((Info->getPSInputAddr() & 0x7F) == 0 ||
|
||||
((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
|
||||
CCInfo.AllocateReg(AMDGPU::VGPR0);
|
||||
CCInfo.AllocateReg(AMDGPU::VGPR1);
|
||||
Info->markPSInputAllocated(0);
|
||||
Info->markPSInputEnabled(0);
|
||||
}
|
||||
if (Subtarget->isAmdPalOS()) {
|
||||
// For isAmdPalOS, the user does not enable some bits after compilation
|
||||
// based on run-time states; the register values being generated here are
|
||||
// the final ones set in hardware. Therefore we need to apply the
|
||||
// workaround to PSInputAddr and PSInputEnable together. (The case where
|
||||
// a bit is set in PSInputAddr but not PSInputEnable is where the
|
||||
// frontend set up an input arg for a particular interpolation mode, but
|
||||
// nothing uses that input arg. Really we should have an earlier pass
|
||||
// that removes such an arg.)
|
||||
unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
|
||||
if ((PsInputBits & 0x7F) == 0 ||
|
||||
((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
|
||||
Info->markPSInputEnabled(
|
||||
countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
|
||||
}
|
||||
|
||||
assert(!Info->hasDispatchPtr() &&
|
||||
!Info->hasKernargSegmentPtr() &&
|
||||
(!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
|
||||
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
|
||||
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
|
||||
!Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
|
||||
!Info->hasWorkItemIDZ());
|
||||
} else if (IsKernel) {
|
||||
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
|
||||
} else {
|
||||
@ -2450,7 +2446,7 @@ SDValue SITargetLowering::LowerFormalArguments(
|
||||
|
||||
// Start adding system SGPRs.
|
||||
if (IsEntryFunc) {
|
||||
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
|
||||
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
|
||||
} else {
|
||||
CCInfo.AllocateReg(Info->getScratchRSrcReg());
|
||||
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
|
||||
@ -2933,7 +2929,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
|
||||
report_fatal_error("unsupported libcall legalization");
|
||||
|
||||
if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
|
||||
!CLI.CB->getCalledFunction()) {
|
||||
!CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) {
|
||||
return lowerUnhandledCall(CLI, InVals,
|
||||
"unsupported indirect call to function ");
|
||||
}
|
||||
@ -2943,11 +2939,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
|
||||
"unsupported required tail call to function ");
|
||||
}
|
||||
|
||||
if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
|
||||
// Note the issue is with the CC of the calling function, not of the call
|
||||
if (AMDGPU::isShader(CallConv)) {
|
||||
// Note the issue is with the CC of the called function, not of the call
|
||||
// itself.
|
||||
return lowerUnhandledCall(CLI, InVals,
|
||||
"unsupported call from graphics shader of function ");
|
||||
"unsupported call to a shader function ");
|
||||
}
|
||||
|
||||
if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
|
||||
CallConv != CallingConv::AMDGPU_Gfx) {
|
||||
// Only allow calls with specific calling conventions.
|
||||
return lowerUnhandledCall(CLI, InVals,
|
||||
"unsupported calling convention for call from "
|
||||
"graphics shader of function ");
|
||||
}
|
||||
|
||||
if (IsTailCall) {
|
||||
@ -2978,7 +2982,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
|
||||
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
|
||||
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
|
||||
|
||||
if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
|
||||
if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
|
||||
CallConv != CallingConv::AMDGPU_Gfx) {
|
||||
// With a fixed ABI, allocate fixed registers before user arguments.
|
||||
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
|
||||
}
|
||||
@ -3112,7 +3117,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
|
||||
}
|
||||
}
|
||||
|
||||
if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
|
||||
if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
|
||||
CallConv != CallingConv::AMDGPU_Gfx) {
|
||||
// Copy special input registers after user input arguments.
|
||||
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
|
||||
}
|
||||
|
@ -5093,9 +5093,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
|
||||
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
|
||||
// scratch memory access. In both cases, the legalization never involves
|
||||
// conversion to the addr64 form.
|
||||
if (isMIMG(MI) ||
|
||||
(AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
|
||||
(isMUBUF(MI) || isMTBUF(MI)))) {
|
||||
if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
|
||||
(isMUBUF(MI) || isMTBUF(MI)))) {
|
||||
MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
|
||||
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
|
||||
CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
|
||||
|
@ -271,6 +271,7 @@ static bool lowerShiftReservedVGPR(MachineFunction &MF,
|
||||
FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
|
||||
|
||||
for (MachineBasicBlock &MBB : MF) {
|
||||
assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
|
||||
MBB.addLiveIn(LowestAvailableVGPR);
|
||||
MBB.sortUniqueLiveIns();
|
||||
}
|
||||
|
@ -126,6 +126,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
|
||||
case CallingConv::C:
|
||||
case CallingConv::Fast:
|
||||
case CallingConv::Cold:
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return CSR_AMDGPU_HighRegs_SaveList;
|
||||
default: {
|
||||
// Dummy to not crash RegisterClassInfo.
|
||||
@ -146,6 +147,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
|
||||
case CallingConv::C:
|
||||
case CallingConv::Fast:
|
||||
case CallingConv::Cold:
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return CSR_AMDGPU_HighRegs_RegMask;
|
||||
default:
|
||||
return nullptr;
|
||||
|
@ -1043,8 +1043,12 @@ bool isShader(CallingConv::ID cc) {
|
||||
}
|
||||
}
|
||||
|
||||
bool isGraphics(CallingConv::ID cc) {
|
||||
return isShader(cc) || cc == CallingConv::AMDGPU_Gfx;
|
||||
}
|
||||
|
||||
bool isCompute(CallingConv::ID cc) {
|
||||
return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
|
||||
return !isGraphics(cc) || cc == CallingConv::AMDGPU_CS;
|
||||
}
|
||||
|
||||
bool isEntryFunctionCC(CallingConv::ID CC) {
|
||||
@ -1439,6 +1443,7 @@ bool isArgPassedInSGPR(const Argument *A) {
|
||||
case CallingConv::AMDGPU_GS:
|
||||
case CallingConv::AMDGPU_PS:
|
||||
case CallingConv::AMDGPU_CS:
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
|
||||
// Everything else is in VGPRs.
|
||||
return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
|
||||
|
@ -567,6 +567,9 @@ unsigned getInitialPSInputAddr(const Function &F);
|
||||
LLVM_READNONE
|
||||
bool isShader(CallingConv::ID CC);
|
||||
|
||||
LLVM_READNONE
|
||||
bool isGraphics(CallingConv::ID CC);
|
||||
|
||||
LLVM_READNONE
|
||||
bool isCompute(CallingConv::ID CC);
|
||||
|
||||
|
@ -736,6 +736,8 @@ static const char *getStageName(CallingConv::ID CC) {
|
||||
return ".hs";
|
||||
case CallingConv::AMDGPU_LS:
|
||||
return ".ls";
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
llvm_unreachable("Callable shader has no hardware stage");
|
||||
default:
|
||||
return ".cs";
|
||||
}
|
||||
|
@ -478,6 +478,8 @@ declare cc90 void @f.cc90()
|
||||
; CHECK: declare amdgpu_cs void @f.cc90()
|
||||
declare amdgpu_cs void @f.amdgpu_cs()
|
||||
; CHECK: declare amdgpu_cs void @f.amdgpu_cs()
|
||||
declare amdgpu_gfx void @f.amdgpu_gfx()
|
||||
; CHECK: declare amdgpu_gfx void @f.amdgpu_gfx()
|
||||
declare cc91 void @f.cc91()
|
||||
; CHECK: declare amdgpu_kernel void @f.cc91()
|
||||
declare amdgpu_kernel void @f.amdgpu_kernel()
|
||||
|
126
test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
Normal file
126
test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
Normal file
@ -0,0 +1,126 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
|
||||
|
||||
; amdgpu_gfx calling convention
|
||||
declare hidden amdgpu_gfx void @external_gfx_void_func_void() #0
|
||||
declare hidden amdgpu_gfx void @external_gfx_void_func_i32(i32) #0
|
||||
declare hidden amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg) #0
|
||||
declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 }) #0
|
||||
declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg) #0
|
||||
|
||||
define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
|
||||
; CHECK-LABEL: name: test_gfx_call_external_void_func_void
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
|
||||
; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY2]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_void()
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
|
||||
; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm
|
||||
; CHECK: bb.1 (%ir-block.1):
|
||||
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
|
||||
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
|
||||
; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32
|
||||
; CHECK: $vgpr0 = COPY [[C]](s32)
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
|
||||
; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY3]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg) #0 {
|
||||
; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm_inreg
|
||||
; CHECK: bb.1 (%ir-block.1):
|
||||
; CHECK: liveins: $sgpr4, $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
|
||||
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
|
||||
; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
|
||||
; CHECK: $sgpr4 = COPY [[C]](s32)
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
|
||||
; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY3]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
|
||||
; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
|
||||
; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
|
||||
; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
|
||||
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
|
||||
; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
|
||||
; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
|
||||
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
|
||||
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
|
||||
; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32
|
||||
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
|
||||
; CHECK: $vgpr1 = COPY [[LOAD2]](s32)
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY2]]
|
||||
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
|
||||
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
|
||||
call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #0 {
|
||||
; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32_inreg
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
|
||||
; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
|
||||
; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
|
||||
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
|
||||
; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
|
||||
; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
|
||||
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
|
||||
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
|
||||
; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
|
||||
; CHECK: $sgpr4 = COPY [[ANYEXT]](s32)
|
||||
; CHECK: $sgpr5 = COPY [[LOAD2]](s32)
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY2]]
|
||||
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
|
||||
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
|
||||
call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { nounwind noinline }
|
@ -56,6 +56,13 @@ declare [5 x i8] @external_a5i8_func_void() #0
|
||||
; return value and argument
declare hidden i32 @external_i32_func_i32(i32) #0

; amdgpu_gfx calling convention
; NOTE(review): the call sites below use the amdgpu_gfx cc; confirm whether these
; declarations should also carry `amdgpu_gfx` so call-site and callee cc match.
declare i1 @external_gfx_i1_func_void() #0
declare i8 @external_gfx_i8_func_void() #0
declare i32 @external_gfx_i32_func_void() #0
declare { i32, i64 } @external_gfx_i32_i64_func_void() #0
declare hidden i32 @external_gfx_i32_func_i32(i32) #0
|
||||
|
||||
|
||||
define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
|
||||
; GCN-LABEL: name: test_call_external_i32_func_i32_imm
|
||||
@ -115,6 +122,55 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call taking an i32 immediate in
; $vgpr0 and returning an i32 in $vgpr0, stored volatile to %out.
define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
  ; GCN-LABEL: name: test_gfx_call_external_i32_func_i32_imm
  ; GCN: bb.1 (%ir-block.0):
  ; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; GCN: [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; GCN: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
  ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
  ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_i32
  ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; GCN: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; GCN: $vgpr0 = COPY [[C]](s32)
  ; GCN: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
  ; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
  ; GCN: $sgpr6_sgpr7 = COPY [[COPY12]](p4)
  ; GCN: $sgpr8_sgpr9 = COPY [[COPY13]](p4)
  ; GCN: $sgpr10_sgpr11 = COPY [[COPY14]](s64)
  ; GCN: $sgpr12 = COPY [[COPY15]](s32)
  ; GCN: $sgpr13 = COPY [[COPY16]](s32)
  ; GCN: $sgpr14 = COPY [[COPY17]](s32)
  ; GCN: $vgpr31 = COPY [[COPY18]](s32)
  ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
  ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; GCN: G_STORE [[COPY20]](s32), [[MV]](p1) :: (volatile store 4 into %ir.out, addrspace 1)
  ; GCN: [[COPY21:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
  ; GCN: S_SETPC_B64_return [[COPY21]]
  %val = call amdgpu_gfx i32 @external_gfx_i32_func_i32(i32 42)
  store volatile i32 %val, i32 addrspace(1)* %out
  ret void
}
|
||||
|
||||
define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
|
||||
; GCN-LABEL: name: test_call_external_i1_func_void
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
@ -171,6 +227,52 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call returning i1: the result is
; returned on the stack (loaded from %fixed-stack.0), not in a register.
define amdgpu_gfx void @test_gfx_call_external_i1_func_void() #0 {
  ; GCN-LABEL: name: test_gfx_call_external_i1_func_void
  ; GCN: bb.1 (%ir-block.0):
  ; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
  ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i1_func_void
  ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; GCN: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; GCN: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
  ; GCN: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
  ; GCN: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
  ; GCN: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
  ; GCN: $sgpr10_sgpr11 = COPY [[COPY12]](s64)
  ; GCN: $sgpr12 = COPY [[COPY13]](s32)
  ; GCN: $sgpr13 = COPY [[COPY14]](s32)
  ; GCN: $sgpr14 = COPY [[COPY15]](s32)
  ; GCN: $vgpr31 = COPY [[COPY16]](s32)
  ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
  ; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
  ; GCN: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.0, align 16, addrspace 5)
  ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; GCN: G_STORE [[LOAD]](s1), [[DEF]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
  ; GCN: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
  ; GCN: S_SETPC_B64_return [[COPY18]]
  %val = call amdgpu_gfx i1 @external_gfx_i1_func_void()
  store volatile i1 %val, i1 addrspace(1)* undef
  ret void
}
|
||||
|
||||
define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
|
||||
; GCN-LABEL: name: test_call_external_i1_zeroext_func_void
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
@ -344,6 +446,53 @@ define amdgpu_kernel void @test_call_external_i8_func_void() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call returning i8: the result
; comes back in $vgpr0 and is truncated s32 -> s16 -> s8 before the store.
define amdgpu_gfx void @test_gfx_call_external_i8_func_void() #0 {
  ; GCN-LABEL: name: test_gfx_call_external_i8_func_void
  ; GCN: bb.1 (%ir-block.0):
  ; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
  ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i8_func_void
  ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; GCN: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; GCN: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
  ; GCN: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
  ; GCN: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
  ; GCN: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
  ; GCN: $sgpr10_sgpr11 = COPY [[COPY12]](s64)
  ; GCN: $sgpr12 = COPY [[COPY13]](s32)
  ; GCN: $sgpr13 = COPY [[COPY14]](s32)
  ; GCN: $sgpr14 = COPY [[COPY15]](s32)
  ; GCN: $vgpr31 = COPY [[COPY16]](s32)
  ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
  ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
  ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16)
  ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; GCN: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
  ; GCN: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
  ; GCN: S_SETPC_B64_return [[COPY19]]
  %val = call amdgpu_gfx i8 @external_gfx_i8_func_void()
  store volatile i8 %val, i8 addrspace(1)* undef
  ret void
}
|
||||
|
||||
define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 {
|
||||
; GCN-LABEL: name: test_call_external_i8_zeroext_func_void
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
@ -689,6 +838,51 @@ define amdgpu_kernel void @test_call_external_i32_func_void() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call returning i32 in $vgpr0.
define amdgpu_gfx void @test_gfx_call_external_i32_func_void() #0 {
  ; GCN-LABEL: name: test_gfx_call_external_i32_func_void
  ; GCN: bb.1 (%ir-block.0):
  ; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
  ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_void
  ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; GCN: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; GCN: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
  ; GCN: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
  ; GCN: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
  ; GCN: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
  ; GCN: $sgpr10_sgpr11 = COPY [[COPY12]](s64)
  ; GCN: $sgpr12 = COPY [[COPY13]](s32)
  ; GCN: $sgpr13 = COPY [[COPY14]](s32)
  ; GCN: $sgpr14 = COPY [[COPY15]](s32)
  ; GCN: $vgpr31 = COPY [[COPY16]](s32)
  ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
  ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; GCN: G_STORE [[COPY18]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
  ; GCN: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
  ; GCN: S_SETPC_B64_return [[COPY19]]
  %val = call amdgpu_gfx i32 @external_gfx_i32_func_void()
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}
|
||||
|
||||
define amdgpu_kernel void @test_call_external_i48_func_void() #0 {
|
||||
; GCN-LABEL: name: test_call_external_i48_func_void
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
@ -2362,6 +2556,59 @@ define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call returning { i32, i64 }:
; the i32 comes back in $vgpr0 and the i64 is merged from $vgpr1/$vgpr2.
define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
  ; GCN-LABEL: name: test_gfx_call_external_i32_i64_func_void
  ; GCN: bb.1 (%ir-block.0):
  ; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
  ; GCN: [[COPY9:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
  ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_i64_func_void
  ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; GCN: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
  ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
  ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
  ; GCN: $sgpr8_sgpr9 = COPY [[COPY12]](p4)
  ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64)
  ; GCN: $sgpr12 = COPY [[COPY14]](s32)
  ; GCN: $sgpr13 = COPY [[COPY15]](s32)
  ; GCN: $sgpr14 = COPY [[COPY16]](s32)
  ; GCN: $vgpr31 = COPY [[COPY17]](s32)
  ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2
  ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2
  ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
  ; GCN: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
  ; GCN: G_STORE [[MV]](s64), [[COPY9]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1)
  ; GCN: [[COPY22:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
  ; GCN: S_SETPC_B64_return [[COPY22]]
  %val = call amdgpu_gfx { i32, i64 } @external_gfx_i32_i64_func_void()
  %val.0 = extractvalue { i32, i64 } %val, 0
  %val.1 = extractvalue { i32, i64 } %val, 1
  store volatile i32 %val.0, i32 addrspace(1)* undef
  store volatile i64 %val.1, i64 addrspace(1)* undef
  ret void
}
|
||||
|
||||
define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 {
|
||||
; GCN-LABEL: name: test_call_external_a2i32_func_void
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
|
@ -87,6 +87,13 @@ declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x
|
||||
declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
                                                <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0

; amdgpu_gfx calling convention
declare hidden amdgpu_gfx void @external_gfx_void_func_void() #0
declare hidden amdgpu_gfx void @external_gfx_void_func_i32(i32) #0
declare hidden amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg) #0
declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 }) #0
declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg) #0
|
||||
|
||||
define amdgpu_kernel void @test_call_external_void_func_void() #0 {
|
||||
; CHECK-LABEL: name: test_call_external_void_func_void
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
@ -138,6 +145,47 @@ define amdgpu_kernel void @test_call_external_void_func_void() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of a void amdgpu_gfx call with no arguments:
; only the implicit SGPR/VGPR inputs and the scratch descriptor are forwarded.
define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
  ; CHECK-LABEL: name: test_gfx_call_external_void_func_void
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; CHECK: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
  ; CHECK: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; CHECK: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; CHECK: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
  ; CHECK: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
  ; CHECK: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
  ; CHECK: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
  ; CHECK: $sgpr10_sgpr11 = COPY [[COPY12]](s64)
  ; CHECK: $sgpr12 = COPY [[COPY13]](s32)
  ; CHECK: $sgpr13 = COPY [[COPY14]](s32)
  ; CHECK: $sgpr14 = COPY [[COPY15]](s32)
  ; CHECK: $vgpr31 = COPY [[COPY16]](s32)
  ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
  ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; CHECK: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
  ; CHECK: S_SETPC_B64_return [[COPY18]]
  call amdgpu_gfx void @external_gfx_void_func_void()
  ret void
}
|
||||
|
||||
define void @test_func_call_external_void_func_void() #0 {
|
||||
; CHECK-LABEL: name: test_func_call_external_void_func_void
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
@ -844,6 +892,94 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call passing an i32 immediate
; as a normal (non-inreg) argument in $vgpr0.
define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
  ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm
  ; CHECK: bb.1 (%ir-block.1):
  ; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; CHECK: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
  ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32
  ; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; CHECK: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; CHECK: $vgpr0 = COPY [[C]](s32)
  ; CHECK: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
  ; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
  ; CHECK: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
  ; CHECK: $sgpr8_sgpr9 = COPY [[COPY12]](p4)
  ; CHECK: $sgpr10_sgpr11 = COPY [[COPY13]](s64)
  ; CHECK: $sgpr12 = COPY [[COPY14]](s32)
  ; CHECK: $sgpr13 = COPY [[COPY15]](s32)
  ; CHECK: $sgpr14 = COPY [[COPY16]](s32)
  ; CHECK: $vgpr31 = COPY [[COPY17]](s32)
  ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
  ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; CHECK: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
  ; CHECK: S_SETPC_B64_return [[COPY19]]
  call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
  ret void
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call passing an inreg i32
; immediate: the argument goes in $sgpr15 and the caller's own inreg argument
; arrives in $sgpr4, shifting the implicit-input SGPR assignment.
define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg) #0 {
  ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm_inreg
  ; CHECK: bb.1 (%ir-block.1):
  ; CHECK: liveins: $sgpr4, $sgpr5, $sgpr14, $sgpr15, $vgpr31, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr30_sgpr31
  ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr12_sgpr13
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; CHECK: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
  ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
  ; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; CHECK: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; CHECK: $sgpr15 = COPY [[C]](s32)
  ; CHECK: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
  ; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
  ; CHECK: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
  ; CHECK: $sgpr8_sgpr9 = COPY [[COPY12]](p4)
  ; CHECK: $sgpr10_sgpr11 = COPY [[COPY13]](s64)
  ; CHECK: $sgpr12 = COPY [[COPY14]](s32)
  ; CHECK: $sgpr13 = COPY [[COPY15]](s32)
  ; CHECK: $sgpr14 = COPY [[COPY16]](s32)
  ; CHECK: $vgpr31 = COPY [[COPY17]](s32)
  ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
  ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; CHECK: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
  ; CHECK: S_SETPC_B64_return [[COPY19]]
  call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
  ret void
}
|
||||
|
||||
define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
|
||||
; CHECK-LABEL: name: test_call_external_void_func_i64_imm
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
@ -3744,6 +3880,110 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call passing { i8, i32 } by
; value: members are split into $vgpr0 (i8 ANYEXTed to s32) and $vgpr1.
define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
  ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; CHECK: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
  ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
  ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
  ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
  ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
  ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
  ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
  ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32
  ; CHECK: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; CHECK: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
  ; CHECK: $vgpr1 = COPY [[LOAD2]](s32)
  ; CHECK: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
  ; CHECK: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
  ; CHECK: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
  ; CHECK: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
  ; CHECK: $sgpr10_sgpr11 = COPY [[COPY12]](s64)
  ; CHECK: $sgpr12 = COPY [[COPY13]](s32)
  ; CHECK: $sgpr13 = COPY [[COPY14]](s32)
  ; CHECK: $sgpr14 = COPY [[COPY15]](s32)
  ; CHECK: $vgpr31 = COPY [[COPY16]](s32)
  ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
  ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; CHECK: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
  ; CHECK: S_SETPC_B64_return [[COPY18]]
  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
  call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
  ret void
}
|
||||
|
||||
; Checks IRTranslator lowering of an amdgpu_gfx call passing an inreg
; { i8, i32 }: members go in $sgpr15/$sgpr16, after the implicit SGPR inputs.
define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #0 {
  ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32_inreg
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
  ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
  ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
  ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
  ; CHECK: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
  ; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
  ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
  ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
  ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
  ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
  ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
  ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
  ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
  ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
  ; CHECK: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
  ; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
  ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
  ; CHECK: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
  ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
  ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
  ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
  ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
  ; CHECK: $sgpr15 = COPY [[ANYEXT]](s32)
  ; CHECK: $sgpr16 = COPY [[LOAD2]](s32)
  ; CHECK: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
  ; CHECK: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
  ; CHECK: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
  ; CHECK: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
  ; CHECK: $sgpr10_sgpr11 = COPY [[COPY12]](s64)
  ; CHECK: $sgpr12 = COPY [[COPY13]](s32)
  ; CHECK: $sgpr13 = COPY [[COPY14]](s32)
  ; CHECK: $sgpr14 = COPY [[COPY15]](s32)
  ; CHECK: $vgpr31 = COPY [[COPY16]](s32)
  ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr15, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
  ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
  ; CHECK: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
  ; CHECK: S_SETPC_B64_return [[COPY18]]
  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
  call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)
  ret void
}
|
||||
|
||||
define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
|
||||
; CHECK-LABEL: name: test_call_external_void_func_byval_struct_i8_i32
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
|
@ -52,3 +52,46 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(void()* %fptr) {
|
||||
call void %fptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(void()* %fptr) {
|
||||
; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
|
||||
; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
|
||||
; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
|
||||
; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; CHECK: [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64(p0) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
|
||||
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
|
||||
; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
|
||||
; CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
|
||||
; CHECK: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
|
||||
; CHECK: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
|
||||
; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
|
||||
; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
|
||||
; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
|
||||
; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
|
||||
; CHECK: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
|
||||
; CHECK: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
|
||||
; CHECK: $sgpr6_sgpr7 = COPY [[COPY12]](p4)
|
||||
; CHECK: $sgpr8_sgpr9 = COPY [[COPY13]](p4)
|
||||
; CHECK: $sgpr10_sgpr11 = COPY [[COPY14]](s64)
|
||||
; CHECK: $sgpr12 = COPY [[COPY15]](s32)
|
||||
; CHECK: $sgpr13 = COPY [[COPY16]](s32)
|
||||
; CHECK: $sgpr14 = COPY [[COPY17]](s32)
|
||||
; CHECK: $vgpr31 = COPY [[COPY18]](s32)
|
||||
; CHECK: $sgpr30_sgpr31 = SI_CALL [[MV]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
|
||||
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK: [[COPY20:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY20]]
|
||||
call amdgpu_gfx void %fptr()
|
||||
ret void
|
||||
}
|
||||
|
16
test/CodeGen/AMDGPU/amdpal-callable.ll
Normal file
16
test/CodeGen/AMDGPU/amdpal-callable.ll
Normal file
@ -0,0 +1,16 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
|
||||
|
||||
; GCN-LABEL: {{^}}gfx_callable_amdpal:
|
||||
; GCN: .amdgpu_pal_metadata
|
||||
; GCN-NEXT: ---
|
||||
; GCN-NEXT: amdpal.pipelines:
|
||||
; GCN-NEXT: - .registers: {}
|
||||
; GCN-NEXT: ...
|
||||
; GCN-NEXT: .end_amdgpu_pal_metadata
|
||||
define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) {
|
||||
%add = fadd half %arg0, 1.0
|
||||
ret half %add
|
||||
}
|
6562
test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
Normal file
6562
test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
Normal file
File diff suppressed because it is too large
Load Diff
835
test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
Normal file
835
test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
Normal file
@ -0,0 +1,835 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
declare hidden amdgpu_gfx void @external_void_func_void() #0
|
||||
|
||||
define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
|
||||
; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 4
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: s_getpc_b64 s[34:35]
|
||||
; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 3
|
||||
; GFX9-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 4
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 4
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GFX10-NEXT: s_getpc_b64 s[34:35]
|
||||
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 3
|
||||
; GFX10-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 4
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "", ""() #0
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @void_func_void_clobber_s30_s31() #1 {
|
||||
; GFX9-LABEL: void_func_void_clobber_s30_s31:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; clobber
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: void_func_void_clobber_s30_s31:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; clobber
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
|
||||
; GFX9-LABEL: test_call_void_func_void_mayclobber_s31:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_mov_b32 s34, s31
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: s_mov_b32 s31, s34
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_mayclobber_s31:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s31
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: s_mov_b32 s34, s31
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: s_mov_b32 s31, s34
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s31
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%s31 = call i32 asm sideeffect "; def $0", "={s31}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
|
||||
; GFX9-LABEL: test_call_void_func_void_mayclobber_v31:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def v31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s31, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v40, v31
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use v31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v41, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v41, 1
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v41, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_mayclobber_v31:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def v31
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s30, 0
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_mov_b32_e32 v40, v31
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use v31
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v41, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v41, 1
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v41, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 {
|
||||
; GFX9-LABEL: test_call_void_func_void_preserves_s33:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s33
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s33
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_preserves_s33:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s33
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s33
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%s33 = call i32 asm sideeffect "; def $0", "={s33}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)* %out) #0 {
|
||||
; GFX9-LABEL: test_call_void_func_void_preserves_s34:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s34
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s34
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_preserves_s34:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s34
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s34
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%s34 = call i32 asm sideeffect "; def $0", "={s34}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{s34}"(i32 %s34)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 {
|
||||
; GFX9-LABEL: test_call_void_func_void_preserves_v40:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s31, 1
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def v40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use v40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v41, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v41, 1
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v41, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_preserves_v40:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s30, 0
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def v40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use v40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v41, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v41, 1
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v41, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%v40 = call i32 asm sideeffect "; def $0", "={v40}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
|
||||
ret void
|
||||
}
|
||||
|
||||
define hidden void @void_func_void_clobber_s33() #1 {
|
||||
; GFX9-LABEL: void_func_void_clobber_s33:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v0, s33, 0
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; clobber
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v0, 0
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: void_func_void_clobber_s33:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_writelane_b32 v0, s33, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; clobber
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v0, 0
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
call void asm sideeffect "; clobber", "~{s33}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
define hidden void @void_func_void_clobber_s34() #1 {
|
||||
; GFX9-LABEL: void_func_void_clobber_s34:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v0, s34, 0
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; clobber
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v0, 0
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: void_func_void_clobber_s34:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_writelane_b32 v0, s34, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; clobber
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_readlane_b32 s34, v0, 0
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
call void asm sideeffect "; clobber", "~{s34}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
|
||||
; GFX9-LABEL: test_call_void_func_void_clobber_s33:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_clobber_s33:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
call amdgpu_gfx void @void_func_void_clobber_s33()
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
|
||||
; GFX9-LABEL: test_call_void_func_void_clobber_s34:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_clobber_s34:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
call amdgpu_gfx void @void_func_void_clobber_s34()
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
|
||||
; GFX9-LABEL: callee_saved_sgpr_kernel:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s40, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s40, v40, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: callee_saved_sgpr_kernel:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s40, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s40, v40, 0
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
|
||||
; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s40, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s30, 1
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def v32
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s31, 2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v40, v32
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use v40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v41, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v41, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s40, v41, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v41, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: callee_saved_sgpr_vgpr_kernel:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s40, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def v32
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_mov_b32_e32 v40, v32
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s31, 2
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use v40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v41, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v41, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s40, v41, 0
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v41, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
|
||||
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
|
||||
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind noinline }
|
@ -577,3 +577,54 @@ bb1:
|
||||
bb2:
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
|
||||
; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v42, s33, 6
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: v_writelane_b32 v42, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v42, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v42, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v42, s37, 3
|
||||
; GCN-NEXT: v_writelane_b32 v42, s30, 4
|
||||
; GCN-NEXT: v_writelane_b32 v42, s31, 5
|
||||
; GCN-NEXT: v_mov_b32_e32 v41, v1
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, v0
|
||||
; GCN-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s6, v40
|
||||
; GCN-NEXT: v_readfirstlane_b32 s7, v41
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GCN-NEXT: s_cbranch_execnz BB6_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v42, 4
|
||||
; GCN-NEXT: v_readlane_b32 s5, v42, 5
|
||||
; GCN-NEXT: v_readlane_b32 s37, v42, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v42, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v42, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v42, 0
|
||||
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v42, 6
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
call amdgpu_gfx void %fptr(i32 inreg 123)
|
||||
ret void
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: not llc -march=amdgcn -mtriple=amdgcn-- -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s
|
||||
; RUN: not llc -march=amdgcn -mtriple=amdgcn-mesa-mesa3d -tailcallopt < %s 2>&1 | FileCheck -check-prefixes=GCN,MESA %s
|
||||
; RUN: not llc -march=amdgcn -mtriple=amdgcn--amdpal -tailcallopt < %s 2>&1 | FileCheck -check-prefixes=GCN,PAL %s
|
||||
; RUN: not llc -march=r600 -mtriple=r600-- -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
|
||||
|
||||
declare i32 @external_function(i32) nounwind
|
||||
@ -68,13 +69,20 @@ define void @test_indirect_call(void()* %fptr) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: :0:0: in function test_call_from_shader i32 (): unsupported call from graphics shader of function defined_function
|
||||
; R600: in function test_call{{.*}}: unsupported call to function defined_function
|
||||
define amdgpu_ps i32 @test_call_from_shader() {
|
||||
; GCN: :0:0: in function test_c_call_from_shader i32 (): unsupported calling convention for call from graphics shader of function defined_function
|
||||
; R600: in function test_c_call{{.*}}: unsupported call to function defined_function
|
||||
define amdgpu_ps i32 @test_c_call_from_shader() {
|
||||
%call = call i32 @defined_function(i32 0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
; GCN-NOT: in function test_gfx_call{{.*}}unsupported
|
||||
; R600: in function test_gfx_call{{.*}}: unsupported call to function defined_function
|
||||
define amdgpu_ps i32 @test_gfx_call_from_shader() {
|
||||
%call = call amdgpu_gfx i32 @defined_function(i32 0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
; FIXME: Bad error message
|
||||
; GCN: error: <unknown>:0:0: in function test_call_absolute void (): unsupported indirect call to function <unknown>
|
||||
; R600: error: <unknown>:0:0: in function test_call_absolute void (): unsupported call to function <unknown>
|
||||
|
Loading…
x
Reference in New Issue
Block a user