llvm-mirror/lib/Target/R600/R600MachineScheduler.cpp

//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief R600 Machine Scheduler interface
// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "misched"

#include "R600MachineScheduler.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/Support/raw_ostream.h"
#include <set>

using namespace llvm;

/// \brief Reset the strategy's bookkeeping for \p dag.
///
/// Caches the DAG together with the R600 views of its instruction/register
/// info, empties the three Available queues, and seeds the per-kind emission
/// limits.
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  // Cache the DAG and the target hooks derived from it.
  DAG = dag;
  MRI = &DAG->MRI;
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);

  // Start with nothing available in any queue.
  Available[IDOther]->clear();
  Available[IDFetch]->clear();
  Available[IDAlu]->clear();

  // No clause is open yet. All four slot bits are set so that the first
  // pickAlu() finds every channel occupied and calls PrepareNextSlot().
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 15;
  memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate));

  // NOTE(review): the previous comment here read "120 minus 8 for security",
  // which does not match the value; presumably "128 minus 8" was meant --
  // confirm against the hardware limit.
  InstKindLimit[IDAlu] = 120;

  // The fetch limit depends on the device generation; one entry is kept back
  // as a safety margin in both cases (8 - 1 or 16 - 1).
  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
  const bool UpToHD5XXX =
      ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX;
  InstKindLimit[IDFetch] = UpToHD5XXX ? 7 : 15;
}

/// \brief Transfer every unit held by \p QSrc into \p QDst.
///
/// Each unit has QSrc's id bit removed from its NodeQueueId before it is
/// pushed on QDst; QSrc is emptied afterwards.
void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst) {
  if (!QSrc->empty()) {
    ReadyQueue::iterator Cur = QSrc->begin();
    const ReadyQueue::iterator End = QSrc->end();
    while (Cur != End) {
      SUnit *Unit = *Cur;
      // The unit no longer belongs to the source queue.
      Unit->NodeQueueId &= ~QSrc->getID();
      QDst->push(Unit);
      ++Cur;
    }
    QSrc->clear();
  }
}

/// \brief Select the next unit to schedule, or null when nothing can be
/// picked.
///
/// Scheduling is always reported as top-down (\p IsTopNode is set to true).
/// The kind of the picked unit is recorded in NextInstKind so schedNode() can
/// detect a clause switch. ALU is tried first when the current clause state
/// allows it, then FETCH, then everything else.
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = 0;
  IsTopNode = true;
  NextInstKind = IDOther;

  // Decide whether ALU should be attempted. The limit of the current kind is
  // only consulted where it is meaningful: initialize() does not assign
  // InstKindLimit[IDOther], so that entry is never read here.
  bool TryAlu;
  if (CurInstKind == IDAlu) {
    // Stay in the ALU clause unless it has grown past its limit AND some
    // non-ALU work is ready to take over.
    const bool AluClauseFull = CurEmitted > InstKindLimit[IDAlu];
    const bool OtherWorkReady =
        !Available[IDFetch]->empty() || !Available[IDOther]->empty();
    TryAlu = !(AluClauseFull && OtherWorkReady);
  } else {
    // Switch to ALU when no real clause is open, when the current clause is
    // over its limit, or when the current kind has nothing left available.
    TryAlu = (CurInstKind == IDOther) ||
        (CurEmitted > InstKindLimit[CurInstKind]) ||
        (Available[CurInstKind]->empty());
  }

  if (TryAlu) {
    // try to pick ALU
    SU = pickAlu();
    if (SU) {
      // Continuing an over-long ALU clause: restart the emission count.
      if (CurEmitted > InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  DEBUG(
      if (SU) {
        dbgs() << "picked node: ";
        SU->dump(DAG);
      } else {
        dbgs() << "NO NODE ";
        for (int i = 0; i < IDLast; ++i) {
          Available[i]->dump();
          Pending[i]->dump();
        }
        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
          const SUnit &S = DAG->SUnits[i];
          if (!S.isScheduled)
            S.dump(DAG);
        }
      }
  );

  return SU;
}

/// \brief Account for \p SU having been scheduled.
///
/// Handles a change of instruction kind (recorded by pickNode() in
/// NextInstKind), updates the count of what was emitted in the current
/// clause, and promotes pending FETCH/other units to their Available queues.
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  DEBUG(dbgs() << "scheduled: ");
  DEBUG(SU->dump(DAG));

  const bool KindChanged = NextInstKind != CurInstKind;
  if (KindChanged) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    // Leaving ALU: mark every slot as occupied so the next ALU pick starts a
    // fresh group.
    if (NextInstKind != IDAlu)
      OccupedSlotsMask = 15;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind != IDAlu) {
    ++CurEmitted;
  } else {
    const AluKind Kind = getAluKind(SU);
    if (Kind == AluT_XYZW) {
      // Occupies a whole instruction group.
      CurEmitted += 4;
    } else if (Kind != AluDiscarded) {
      // One slot for the instruction itself, plus one per operand that is
      // the ALU_LITERAL_X register.
      ++CurEmitted;
      MachineInstr *Inst = SU->getInstr();
      for (MachineInstr::mop_iterator Op = Inst->operands_begin(),
          OpEnd = Inst->operands_end(); Op != OpEnd; ++Op) {
        if (Op->isReg() && Op->getReg() == AMDGPU::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    // AluDiscarded units do not count towards the clause size.
  }

  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  // Pending FETCH units are held back while a FETCH clause is in progress.
  if (CurInstKind != IDFetch)
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  MoveUnits(Pending[IDOther], Available[IDOther]);
}

/// \brief Queue a newly released unit on the Pending queue matching its
/// instruction kind (as classified by getInstKind()).
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  const int QueueIdx = getInstKind(SU);

  DEBUG(
      dbgs() << QueueIdx << " <= ";
      SU->dump(DAG);
  );

  Pending[QueueIdx]->push(SU);
}

/// \brief Bottom-up release hook; intentionally a no-op.
///
/// pickNode() in this file always sets IsTopNode to true, and nothing is
/// queued from here.
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
}

/// \brief Tell whether \p Reg is in register class \p RC.
///
/// A virtual register matches only when its assigned class is exactly \p RC;
/// any other register is tested for membership with RC->contains().
bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg) == RC;
  return RC->contains(Reg);
}

/// \brief Classify \p SU by the ALU slot(s) it needs.
///
/// \returns AluT_XYZW for instructions that take a whole group, AluDiscarded
/// for COPYs that are not worth scheduling, AluT_X/Y/Z/W when the destination
/// already pins a channel, and AluAny otherwise.
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();
  const unsigned Opc = MI->getOpcode();

  // Interpolation opcodes always take the whole group.
  if (Opc == AMDGPU::INTERP_PAIR_XY || Opc == AMDGPU::INTERP_PAIR_ZW ||
      Opc == AMDGPU::INTERP_VEC_LOAD)
    return AluT_XYZW;

  if (Opc == AMDGPU::COPY) {
    const MachineOperand &Src = MI->getOperand(1);
    // %vregX = COPY Tn_X is likely to be discarded in favor of assigning
    // Tn_X to %vregX, and a COPY of an undef source will become a KILL;
    // neither is considered for scheduling.
    if (TargetRegisterInfo::isPhysicalRegister(Src.getReg()) || Src.isUndef())
      return AluDiscarded;
    // Any other COPY is classified like a regular instruction below.
  }

  // Does the instruction take a whole IG ?
  if (TII->isVector(*MI) || TII->isCubeOp(Opc) || TII->isReductionOp(Opc))
    return AluT_XYZW;

  const MachineOperand &Dest = MI->getOperand(0);

  // Is the result already assigned to a channel through a sub-register ?
  switch (Dest.getSubReg()) {
  case AMDGPU::sub0: return AluT_X;
  case AMDGPU::sub1: return AluT_Y;
  case AMDGPU::sub2: return AluT_Z;
  case AMDGPU::sub3: return AluT_W;
  default:           break;
  }

  // Is the result already a member of a X/Y/Z/W class ?
  const unsigned DestReg = Dest.getReg();
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
    return AluT_XYZW;

  // No constraint found: the instruction may go in any channel.
  return AluAny;
}

/// \brief Map \p SU to the queue index of its instruction kind.
///
/// \returns IDAlu, IDFetch or IDOther. Anything that falls in IDOther is also
/// reported on the debug stream.
int R600SchedStrategy::getInstKind(SUnit* SU) {
  const int Opc = SU->getInstr()->getOpcode();

  if (TII->isALUInstr(Opc))
    return IDAlu;

  switch (Opc) {
  // Opcodes not flagged by isALUInstr() that are still queued as ALU.
  case AMDGPU::COPY:
  case AMDGPU::CONST_COPY:
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT4_eg_pseudo:
  case AMDGPU::DOT4_r600_pseudo:
    return IDAlu;

  // Vertex/texture opcodes queued as FETCH.
  case AMDGPU::TEX_VTX_CONSTBUF:
  case AMDGPU::TEX_VTX_TEXBUF:
  case AMDGPU::TEX_LD:
  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
  case AMDGPU::TEX_GET_GRADIENTS_H:
  case AMDGPU::TEX_GET_GRADIENTS_V:
  case AMDGPU::TEX_SET_GRADIENTS_H:
  case AMDGPU::TEX_SET_GRADIENTS_V:
  case AMDGPU::TEX_SAMPLE:
  case AMDGPU::TEX_SAMPLE_C:
  case AMDGPU::TEX_SAMPLE_L:
  case AMDGPU::TEX_SAMPLE_C_L:
  case AMDGPU::TEX_SAMPLE_LB:
  case AMDGPU::TEX_SAMPLE_C_LB:
  case AMDGPU::TEX_SAMPLE_G:
  case AMDGPU::TEX_SAMPLE_C_G:
  case AMDGPU::TXD:
  case AMDGPU::TXD_SHADOW:
    return IDFetch;

  default:
    break;
  }

  // Everything else.
  DEBUG(
      dbgs() << "other inst: ";
      SU->dump(DAG);
  );
  return IDOther;
}

/// \brief Summary of the constant selectors read by one instruction, split
/// into an XY half and a ZW half.
///
/// For each selector the low two bits are taken as the channel and the
/// remaining bits as the index. Channels 0-1 feed XYPair, channels 2-3 feed
/// ZWPair; only the first non-zero index seen for each half is kept.
///
/// NOTE(review): zero doubles as "nothing recorded", so a selector whose
/// index part is 0 cannot be told apart from no read at all -- confirm that
/// this is acceptable for the callers.
class ConstPairs {
  unsigned XYPair;
  unsigned ZWPair;

public:
  /// Build the summary from up to three selectors (0 meaning "none").
  ConstPairs(unsigned ReadConst[3]) : XYPair(0), ZWPair(0) {
    for (unsigned Src = 0; Src != 3; ++Src) {
      const unsigned Sel = ReadConst[Src];
      const unsigned Chan = Sel & 3;
      unsigned &Half = (Chan < 2) ? XYPair : ZWPair;
      // First recorded index wins; later ones for the same half are ignored.
      if (Half == 0)
        Half = Sel & ~3u;
    }
  }

  /// Two summaries are compatible when, for each half, at least one side has
  /// nothing recorded or both recorded the same index.
  bool isCompatibleWith(const ConstPairs& CP) const {
    const bool XYClash = XYPair && CP.XYPair && XYPair != CP.XYPair;
    const bool ZWClash = ZWPair && CP.ZWPair && ZWPair != CP.ZWPair;
    return !XYClash && !ZWClash;
  }
};

/// \brief Collect the constant selectors read by \p MI into a ConstPairs.
///
/// Non-ALU instructions yield an all-zero summary. For ALU instructions each
/// of SRC0..SRC2 whose register is ALU_CONST contributes the immediate found
/// in the matching SRCn_SEL operand; scanning stops at the first source the
/// opcode does not have.
static
const ConstPairs getPairs(const R600InstrInfo *TII, const MachineInstr& MI) {
  unsigned Selectors[3] = {0, 0, 0};

  if (TII->isALUInstr(MI.getOpcode())) {
    const R600Operands::Ops SrcOps[3] = {
      R600Operands::SRC0, R600Operands::SRC1, R600Operands::SRC2
    };
    const R600Operands::Ops SelOps[3] = {
      R600Operands::SRC0_SEL, R600Operands::SRC1_SEL, R600Operands::SRC2_SEL
    };

    for (unsigned Src = 0; Src != 3; ++Src) {
      const int SrcIdx = TII->getOperandIdx(MI.getOpcode(), SrcOps[Src]);
      if (SrcIdx < 0)
        break;
      if (MI.getOperand(SrcIdx).getReg() != AMDGPU::ALU_CONST)
        continue;
      const int SelIdx = TII->getOperandIdx(MI.getOpcode(), SelOps[Src]);
      Selectors[Src] = MI.getOperand(SelIdx).getImm();
    }
  }

  return ConstPairs(Selectors);
}

bool
R600SchedStrategy::isBundleable(const MachineInstr& MI) {
  const ConstPairs &MIPair = getPairs(TII, MI);
  for (unsigned i = 0; i < 4; i++) {
    if (!InstructionsGroupCandidate[i])
      continue;
    const ConstPairs &IGPair = getPairs(TII,
        *InstructionsGroupCandidate[i]->getInstr());
    if (!IGPair.isCompatibleWith(MIPair))
      return false;
  }
  return true;
}

/// \brief Remove and return the first unit of \p Q (in the multiset's order)
/// that isBundleable() accepts for the group under construction.
///
/// \returns the removed unit, or NULL when \p Q is empty or none of its units
/// can be bundled. \p Q is left unchanged in the NULL case.
SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
  // Use the iterator type of the container that is actually traversed; a
  // std::set iterator is not guaranteed to be interchangeable with it.
  typedef std::multiset<SUnit *, CompareSUnit>::iterator QueueIterator;

  for (QueueIterator It = Q.begin(), E = Q.end(); It != E; ++It) {
    SUnit *SU = *It;
    if (isBundleable(*SU->getInstr())) {
      // Erasing invalidates It, so return immediately.
      Q.erase(It);
      return SU;
    }
  }
  return NULL;
}

void R600SchedStrategy::LoadAlu() {
  ReadyQueue *QSrc = Pending[IDAlu];
  for (ReadyQueue::iterator I = QSrc->begin(),
        E = QSrc->end(); I != E; ++I) {
      (*I)->NodeQueueId &= ~QSrc->getID();
      AluKind AK = getAluKind(*I);
      AvailableAlus[AK].insert(*I);
    }
    QSrc->clear();
}

/// \brief Open a new instruction group: forget the units recorded for the
/// previous one, mark all slots free and pull in pending ALU units.
///
/// Asserts that at least one slot of the previous group was marked occupied.
void R600SchedStrategy::PrepareNextSlot() {
  DEBUG(dbgs() << "New Slot\n");
  assert (OccupedSlotsMask && "Slot wasn't filled");

  memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate));
  OccupedSlotsMask = 0;

  LoadAlu();
}

/// \brief Constrain the register class of \p MI's first operand so that it
/// matches channel \p Slot (0 = X, 1 = Y, 2 = Z, 3 = W).
///
/// Nothing is done when the same register also appears as a non-def operand
/// of \p MI, or when \p Slot is outside 0..3.
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  const unsigned DestReg = MI->getOperand(0).getReg();

  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constrain its regclass, so leave such instructions alone.
  for (MachineInstr::mop_iterator Op = MI->operands_begin(),
      OpEnd = MI->operands_end(); Op != OpEnd; ++Op) {
    if (!Op->isReg() || Op->isDef())
      continue;
    if (Op->getReg() == DestReg)
      return;
  }

  // Register class for each channel, indexed by slot number.
  const TargetRegisterClass *const ChannelRC[4] = {
    &AMDGPU::R600_TReg32_XRegClass,
    &AMDGPU::R600_TReg32_YRegClass,
    &AMDGPU::R600_TReg32_ZRegClass,
    &AMDGPU::R600_TReg32_WRegClass
  };
  if (Slot < 4)
    MRI->constrainRegClass(DestReg, ChannelRC[Slot]);
}

/// \brief Try to find a unit for channel \p Slot.
///
/// One candidate is popped from the set bound to that channel and one from
/// the AluAny set. When both exist, CompareSUnit decides which is kept and
/// the other is put back in its set. A unit taken from AluAny is tied to the
/// channel through AssignSlot().
///
/// \returns the chosen unit, or null when neither set offered one.
SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
  static const AluKind ChannelKind[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  const AluKind SlotKind = ChannelKind[Slot];

  SUnit *Pinned = PopInst(AvailableAlus[SlotKind]);
  SUnit *Floating = PopInst(AvailableAlus[AluAny]);

  // Only the channel-bound candidate (possibly none) is on offer.
  if (!Floating)
    return Pinned;

  if (Pinned) {
    // Determine which one to pick (the lesser one); the loser goes back.
    if (CompareSUnit()(Pinned, Floating)) {
      AvailableAlus[AluAny].insert(Floating);
      return Pinned;
    }
    AvailableAlus[SlotKind].insert(Pinned);
  }

  AssignSlot(Floating->getInstr(), Slot);
  return Floating;
}

bool R600SchedStrategy::isAvailablesAluEmpty() const {
  return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&
      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
}

/// \brief Pick the next ALU unit, opening new instruction groups as needed.
///
/// While any ALU unit remains: if the current group is still empty, a
/// discarded copy or a whole-group (T_XYZW) unit is taken and the group is
/// marked full; otherwise the first free channel that AttemptFillSlot() can
/// fill is used. When nothing fits, PrepareNextSlot() starts a new group and
/// the search repeats.
///
/// \returns the picked unit, or NULL once no ALU unit is left.
SUnit* R600SchedStrategy::pickAlu() {
  while (!isAvailablesAluEmpty()) {
    if (OccupedSlotsMask == 0) {
      // Flush physical reg copies first (RA will discard them), then any
      // available T_XYZW alu; either one claims the whole group.
      static const AluKind WholeGroup[] = {AluDiscarded, AluT_XYZW};
      for (unsigned K = 0; K != 2; ++K) {
        if (AvailableAlus[WholeGroup[K]].empty())
          continue;
        OccupedSlotsMask = 15;
        return PopInst(AvailableAlus[WholeGroup[K]]);
      }
    }

    for (unsigned Chan = 0; Chan < 4; ++Chan) {
      const int ChanBit = 1 << Chan;
      if (OccupedSlotsMask & ChanBit)
        continue;
      SUnit *Filled = AttemptFillSlot(Chan);
      if (!Filled)
        continue;
      // Remember the unit so later picks are checked against it.
      OccupedSlotsMask |= ChanBit;
      InstructionsGroupCandidate[Chan] = Filled;
      return Filled;
    }

    // Nothing fits in what is left of this group.
    PrepareNextSlot();
  }
  return NULL;
}

/// \brief Take the first unit of the Available queue \p QID.
///
/// If that queue is empty it is first refilled from the matching Pending
/// queue.
///
/// \returns the removed unit, or null when both queues were empty.
SUnit* R600SchedStrategy::pickOther(int QID) {
  ReadyQueue *Queue = Available[QID];

  if (Queue->empty())
    MoveUnits(Pending[QID], Queue);

  if (Queue->empty())
    return 0;

  SUnit *Picked = *Queue->begin();
  Queue->remove(Queue->begin());
  return Picked;
}
R600: initial scheduler code This is a skeleton for a pre-RA MachineInstr scheduler strategy. Currently it only tries to expose more parallelism for ALU instructions (this also makes the distribution of GPR channels more uniform and increases the chances of ALU instructions to be packed together in a single VLIW group). Also it tries to reduce clause switching by grouping instruction of the same kind (ALU/FETCH/CF) together. Vincent Lejeune: - Support for VLIW4 Slot assignement - Recomputation of ScheduleDAG to get more parallelism opportunities Tom Stellard: - Fix assertion failure when trying to determine an instruction's slot based on its destination register's class - Fix some compiler warnings Vincent Lejeune: [v2] - Remove recomputation of ScheduleDAG (will be provided in a later patch) - Improve estimation of an ALU clause size so that heuristic does not emit cf instructions at the wrong position. - Make schedule heuristic smarter using SUnit Depth - Take constant read limitations into account Vincent Lejeune: [v3] - Fix some uninitialized values in ConstPair - Add asserts to ensure an ALU slot is always populated llvm-svn: 176498 2013-03-05 19:41:32 +01:00			`//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -- C++ ------===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`
			`//`
			`/// \file`
			`/// \brief R600 Machine Scheduler interface`
			`// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#define DEBUG_TYPE "misched"`

			`#include "R600MachineScheduler.h"`
			`#include "llvm/CodeGen/MachineRegisterInfo.h"`
			`#include "llvm/CodeGen/LiveIntervalAnalysis.h"`
			`#include "llvm/Pass.h"`
			`#include "llvm/PassManager.h"`
R600MachineScheduler.cpp: Fix use cases of dbgs(). Don't include <iostream> here. llvm-svn: 176797 2013-03-11 09:19:28 +01:00			`#include "llvm/Support/raw_ostream.h"`
R600: initial scheduler code This is a skeleton for a pre-RA MachineInstr scheduler strategy. Currently it only tries to expose more parallelism for ALU instructions (this also makes the distribution of GPR channels more uniform and increases the chances of ALU instructions to be packed together in a single VLIW group). Also it tries to reduce clause switching by grouping instruction of the same kind (ALU/FETCH/CF) together. Vincent Lejeune: - Support for VLIW4 Slot assignement - Recomputation of ScheduleDAG to get more parallelism opportunities Tom Stellard: - Fix assertion failure when trying to determine an instruction's slot based on its destination register's class - Fix some compiler warnings Vincent Lejeune: [v2] - Remove recomputation of ScheduleDAG (will be provided in a later patch) - Improve estimation of an ALU clause size so that heuristic does not emit cf instructions at the wrong position. - Make schedule heuristic smarter using SUnit Depth - Take constant read limitations into account Vincent Lejeune: [v3] - Fix some uninitialized values in ConstPair - Add asserts to ensure an ALU slot is always populated llvm-svn: 176498 2013-03-05 19:41:32 +01:00			`#include <set>`
R600MachineScheduler.cpp: Fix use cases of dbgs(). Don't include <iostream> here. llvm-svn: 176797 2013-03-11 09:19:28 +01:00
R600: initial scheduler code This is a skeleton for a pre-RA MachineInstr scheduler strategy. Currently it only tries to expose more parallelism for ALU instructions (this also makes the distribution of GPR channels more uniform and increases the chances of ALU instructions to be packed together in a single VLIW group). Also it tries to reduce clause switching by grouping instruction of the same kind (ALU/FETCH/CF) together. Vincent Lejeune: - Support for VLIW4 Slot assignement - Recomputation of ScheduleDAG to get more parallelism opportunities Tom Stellard: - Fix assertion failure when trying to determine an instruction's slot based on its destination register's class - Fix some compiler warnings Vincent Lejeune: [v2] - Remove recomputation of ScheduleDAG (will be provided in a later patch) - Improve estimation of an ALU clause size so that heuristic does not emit cf instructions at the wrong position. - Make schedule heuristic smarter using SUnit Depth - Take constant read limitations into account Vincent Lejeune: [v3] - Fix some uninitialized values in ConstPair - Add asserts to ensure an ALU slot is always populated llvm-svn: 176498 2013-03-05 19:41:32 +01:00			`using namespace llvm;`

			`void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {`

			`DAG = dag;`
			`TII = static_cast<const R600InstrInfo*>(DAG->TII);`
			`TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);`
			`MRI = &DAG->MRI;`
			`Available[IDAlu]->clear();`
			`Available[IDFetch]->clear();`
			`Available[IDOther]->clear();`
			`CurInstKind = IDOther;`
			`CurEmitted = 0;`
			`OccupedSlotsMask = 15;`
			`memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate));`
			`InstKindLimit[IDAlu] = 120; // 120 minus 8 for security`


			`const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();`
			`if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) {`
			`InstKindLimit[IDFetch] = 7; // 8 minus 1 for security`
			`} else {`
			`InstKindLimit[IDFetch] = 15; // 16 minus 1 for security`
			`}`
			`}`

			`void R600SchedStrategy::MoveUnits(ReadyQueue QSrc, ReadyQueue QDst)`
			`{`
			`if (QSrc->empty())`
			`return;`
			`for (ReadyQueue::iterator I = QSrc->begin(),`
			`E = QSrc->end(); I != E; ++I) {`
			`(*I)->NodeQueueId &= ~QSrc->getID();`
			`QDst->push(*I);`
			`}`
			`QSrc->clear();`
			`}`

			`SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {`
			`SUnit *SU = 0;`
			`IsTopNode = true;`
			`NextInstKind = IDOther;`

			`// check if we might want to switch current clause type`
			`bool AllowSwitchToAlu = (CurInstKind == IDOther) \|\|`
			`(CurEmitted > InstKindLimit[CurInstKind]) \|\|`
			`(Available[CurInstKind]->empty());`
			`bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) &&`
			`(!Available[IDFetch]->empty() \|\| !Available[IDOther]->empty());`

			`if ((AllowSwitchToAlu && CurInstKind != IDAlu) \|\|`
			`(!AllowSwitchFromAlu && CurInstKind == IDAlu)) {`
			`// try to pick ALU`
			`SU = pickAlu();`
			`if (SU) {`
			`if (CurEmitted > InstKindLimit[IDAlu])`
			`CurEmitted = 0;`
			`NextInstKind = IDAlu;`
			`}`
			`}`

			`if (!SU) {`
			`// try to pick FETCH`
			`SU = pickOther(IDFetch);`
			`if (SU)`
			`NextInstKind = IDFetch;`
			`}`

			`// try to pick other`
			`if (!SU) {`
			`SU = pickOther(IDOther);`
			`if (SU)`
			`NextInstKind = IDOther;`
			`}`

			`DEBUG(`
			`if (SU) {`
			`dbgs() << "picked node: ";`
			`SU->dump(DAG);`
			`} else {`
			`dbgs() << "NO NODE ";`
			`for (int i = 0; i < IDLast; ++i) {`
			`Available[i]->dump();`
			`Pending[i]->dump();`
			`}`
			`for (unsigned i = 0; i < DAG->SUnits.size(); i++) {`
			`const SUnit &S = DAG->SUnits[i];`
			`if (!S.isScheduled)`
			`S.dump(DAG);`
			`}`
			`}`
			`);`

			`return SU;`
			`}`

			`void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {`

			`DEBUG(dbgs() << "scheduled: ");`
			`DEBUG(SU->dump(DAG));`

			`if (NextInstKind != CurInstKind) {`
			`DEBUG(dbgs() << "Instruction Type Switch\n");`
			`if (NextInstKind != IDAlu)`
			`OccupedSlotsMask = 15;`
			`CurEmitted = 0;`
			`CurInstKind = NextInstKind;`
			`}`

			`if (CurInstKind == IDAlu) {`
			`switch (getAluKind(SU)) {`
			`case AluT_XYZW:`
			`CurEmitted += 4;`
			`break;`
			`case AluDiscarded:`
			`break;`
			`default: {`
			`++CurEmitted;`
			`for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),`
			`E = SU->getInstr()->operands_end(); It != E; ++It) {`
			`MachineOperand &MO = *It;`
			`if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)`
			`++CurEmitted;`
			`}`
			`}`
			`}`
			`} else {`
			`++CurEmitted;`
			`}`


			`DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");`

			`if (CurInstKind != IDFetch) {`
			`MoveUnits(Pending[IDFetch], Available[IDFetch]);`
			`}`
			`MoveUnits(Pending[IDOther], Available[IDOther]);`
			`}`

			`void R600SchedStrategy::releaseTopNode(SUnit *SU) {`
			`int IK = getInstKind(SU);`

			`DEBUG(dbgs() << IK << " <= ");`
			`DEBUG(SU->dump(DAG));`

			`Pending[IK]->push(SU);`
			`}`

			`void R600SchedStrategy::releaseBottomNode(SUnit *SU) {`
			`}`

			`bool R600SchedStrategy::regBelongsToClass(unsigned Reg,`
			`const TargetRegisterClass *RC) const {`
			`if (!TargetRegisterInfo::isVirtualRegister(Reg)) {`
			`return RC->contains(Reg);`
			`} else {`
			`return MRI->getRegClass(Reg) == RC;`
			`}`
			`}`

			`R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {`
			`MachineInstr *MI = SU->getInstr();`

			`switch (MI->getOpcode()) {`
			`case AMDGPU::INTERP_PAIR_XY:`
			`case AMDGPU::INTERP_PAIR_ZW:`
			`case AMDGPU::INTERP_VEC_LOAD:`
			`return AluT_XYZW;`
			`case AMDGPU::COPY:`
			`if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {`
			`// %vregX = COPY Tn_X is likely to be discarded in favor of an`
			`// assignement of Tn_X to %vregX, don't considers it in scheduling`
			`return AluDiscarded;`
			`}`
			`else if (MI->getOperand(1).isUndef()) {`
			`// MI will become a KILL, don't considers it in scheduling`
			`return AluDiscarded;`
			`}`
			`default:`
			`break;`
			`}`

			`// Does the instruction take a whole IG ?`
			`if(TII->isVector(*MI) \|\|`
			`TII->isCubeOp(MI->getOpcode()) \|\|`
			`TII->isReductionOp(MI->getOpcode()))`
			`return AluT_XYZW;`

			`// Is the result already assigned to a channel ?`
			`unsigned DestSubReg = MI->getOperand(0).getSubReg();`
			`switch (DestSubReg) {`
			`case AMDGPU::sub0:`
			`return AluT_X;`
			`case AMDGPU::sub1:`
			`return AluT_Y;`
			`case AMDGPU::sub2:`
			`return AluT_Z;`
			`case AMDGPU::sub3:`
			`return AluT_W;`
			`default:`
			`break;`
			`}`

			`// Is the result already member of a X/Y/Z/W class ?`
			`unsigned DestReg = MI->getOperand(0).getReg();`
			`if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) \|\|`
			`regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))`
			`return AluT_X;`
			`if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))`
			`return AluT_Y;`
			`if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))`
			`return AluT_Z;`
			`if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))`
			`return AluT_W;`
			`if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))`
			`return AluT_XYZW;`

			`return AluAny;`

			`}`

			`int R600SchedStrategy::getInstKind(SUnit* SU) {`
			`int Opcode = SU->getInstr()->getOpcode();`

			`if (TII->isALUInstr(Opcode)) {`
			`return IDAlu;`
			`}`

			`switch (Opcode) {`
			`case AMDGPU::COPY:`
			`case AMDGPU::CONST_COPY:`
			`case AMDGPU::INTERP_PAIR_XY:`
			`case AMDGPU::INTERP_PAIR_ZW:`
			`case AMDGPU::INTERP_VEC_LOAD:`
			`case AMDGPU::DOT4_eg_pseudo:`
			`case AMDGPU::DOT4_r600_pseudo:`
			`return IDAlu;`
			`case AMDGPU::TEX_VTX_CONSTBUF:`
			`case AMDGPU::TEX_VTX_TEXBUF:`
			`case AMDGPU::TEX_LD:`
			`case AMDGPU::TEX_GET_TEXTURE_RESINFO:`
			`case AMDGPU::TEX_GET_GRADIENTS_H:`
			`case AMDGPU::TEX_GET_GRADIENTS_V:`
			`case AMDGPU::TEX_SET_GRADIENTS_H:`
			`case AMDGPU::TEX_SET_GRADIENTS_V:`
			`case AMDGPU::TEX_SAMPLE:`
			`case AMDGPU::TEX_SAMPLE_C:`
			`case AMDGPU::TEX_SAMPLE_L:`
			`case AMDGPU::TEX_SAMPLE_C_L:`
			`case AMDGPU::TEX_SAMPLE_LB:`
			`case AMDGPU::TEX_SAMPLE_C_LB:`
			`case AMDGPU::TEX_SAMPLE_G:`
			`case AMDGPU::TEX_SAMPLE_C_G:`
			`case AMDGPU::TXD:`
			`case AMDGPU::TXD_SHADOW:`
			`return IDFetch;`
			`default:`
			`DEBUG(`
			`dbgs() << "other inst: ";`
			`SU->dump(DAG);`
			`);`
			`return IDOther;`
			`}`
			`}`

			`class ConstPairs {`
			`private:`
			`unsigned XYPair;`
			`unsigned ZWPair;`
			`public:`
			`ConstPairs(unsigned ReadConst[3]) : XYPair(0), ZWPair(0) {`
			`for (unsigned i = 0; i < 3; i++) {`
			`unsigned ReadConstChan = ReadConst[i] & 3;`
			`unsigned ReadConstIndex = ReadConst[i] & (~3);`
			`if (ReadConstChan < 2) {`
			`if (!XYPair) {`
			`XYPair = ReadConstIndex;`
			`}`
			`} else {`
			`if (!ZWPair) {`
			`ZWPair = ReadConstIndex;`
			`}`
			`}`
			`}`
			`}`

			`bool isCompatibleWith(const ConstPairs& CP) const {`
			`return (!XYPair \|\| !CP.XYPair \|\| CP.XYPair == XYPair) &&`
			`(!ZWPair \|\| !CP.ZWPair \|\| CP.ZWPair == ZWPair);`
			`}`
			`};`

			`static`
			`const ConstPairs getPairs(const R600InstrInfo *TII, const MachineInstr& MI) {`
			`unsigned ReadConsts[3] = {0, 0, 0};`
			`R600Operands::Ops OpTable[3][2] = {`
			`{R600Operands::SRC0, R600Operands::SRC0_SEL},`
			`{R600Operands::SRC1, R600Operands::SRC1_SEL},`
			`{R600Operands::SRC2, R600Operands::SRC2_SEL},`
			`};`

			`if (!TII->isALUInstr(MI.getOpcode()))`
			`return ConstPairs(ReadConsts);`

			`for (unsigned i = 0; i < 3; i++) {`
			`int SrcIdx = TII->getOperandIdx(MI.getOpcode(), OpTable[i][0]);`
			`if (SrcIdx < 0)`
			`break;`
			`if (MI.getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST)`
			`ReadConsts[i] =MI.getOperand(`
			`TII->getOperandIdx(MI.getOpcode(), OpTable[i][1])).getImm();`
			`}`
			`return ConstPairs(ReadConsts);`
			`}`

			`bool`
			`R600SchedStrategy::isBundleable(const MachineInstr& MI) {`
			`const ConstPairs &MIPair = getPairs(TII, MI);`
			`for (unsigned i = 0; i < 4; i++) {`
			`if (!InstructionsGroupCandidate[i])`
			`continue;`
			`const ConstPairs &IGPair = getPairs(TII,`
			`*InstructionsGroupCandidate[i]->getInstr());`
			`if (!IGPair.isCompatibleWith(MIPair))`
			`return false;`
			`}`
			`return true;`
			`}`

			`SUnit R600SchedStrategy::PopInst(std::multiset<SUnit , CompareSUnit> &Q) {`
			`if (Q.empty())`
			`return NULL;`
			`for (std::set<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end();`
			`It != E; ++It) {`
			`SUnit SU = It;`
			`if (isBundleable(*SU->getInstr())) {`
			`Q.erase(It);`
			`return SU;`
			`}`
			`}`
			`return NULL;`
			`}`

			`void R600SchedStrategy::LoadAlu() {`
			`ReadyQueue *QSrc = Pending[IDAlu];`
			`for (ReadyQueue::iterator I = QSrc->begin(),`
			`E = QSrc->end(); I != E; ++I) {`
			`(*I)->NodeQueueId &= ~QSrc->getID();`
			`AluKind AK = getAluKind(*I);`
			`AvailableAlus[AK].insert(*I);`
			`}`
			`QSrc->clear();`
			`}`

			`void R600SchedStrategy::PrepareNextSlot() {`
			`DEBUG(dbgs() << "New Slot\n");`
			`assert (OccupedSlotsMask && "Slot wasn't filled");`
			`OccupedSlotsMask = 0;`
			`memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate));`
			`LoadAlu();`
			`}`

			`void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {`
			`unsigned DestReg = MI->getOperand(0).getReg();`
			`// PressureRegister crashes if an operand is def and used in the same inst`
			`// and we try to constraint its regclass`
			`for (MachineInstr::mop_iterator It = MI->operands_begin(),`
			`E = MI->operands_end(); It != E; ++It) {`
			`MachineOperand &MO = *It;`
			`if (MO.isReg() && !MO.isDef() &&`
			`MO.getReg() == MI->getOperand(0).getReg())`
			`return;`
			`}`
			`// Constrains the regclass of DestReg to assign it to Slot`
			`switch (Slot) {`
			`case 0:`
			`MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);`
			`break;`
			`case 1:`
			`MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);`
			`break;`
			`case 2:`
			`MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);`
			`break;`
			`case 3:`
			`MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);`
			`break;`
			`}`
			`}`

			`SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {`
			`static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};`
			`SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);`
			`SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);`
			`if (!UnslotedSU) {`
			`return SlotedSU;`
			`} else if (!SlotedSU) {`
			`AssignSlot(UnslotedSU->getInstr(), Slot);`
			`return UnslotedSU;`
			`} else {`
			`//Determine which one to pick (the lesser one)`
			`if (CompareSUnit()(SlotedSU, UnslotedSU)) {`
			`AvailableAlus[AluAny].insert(UnslotedSU);`
			`return SlotedSU;`
			`} else {`
			`AvailableAlus[IndexToID[Slot]].insert(SlotedSU);`
			`AssignSlot(UnslotedSU->getInstr(), Slot);`
			`return UnslotedSU;`
			`}`
			`}`
			`}`

			`bool R600SchedStrategy::isAvailablesAluEmpty() const {`
			`return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&`
			`AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&`
			`AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&`
			`AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();`
			`}`

			`SUnit* R600SchedStrategy::pickAlu() {`
			`while (!isAvailablesAluEmpty()) {`
			`if (!OccupedSlotsMask) {`
			`// Flush physical reg copies (RA will discard them)`
			`if (!AvailableAlus[AluDiscarded].empty()) {`
			`OccupedSlotsMask = 15;`
			`return PopInst(AvailableAlus[AluDiscarded]);`
			`}`
			`// If there is a T_XYZW alu available, use it`
			`if (!AvailableAlus[AluT_XYZW].empty()) {`
			`OccupedSlotsMask = 15;`
			`return PopInst(AvailableAlus[AluT_XYZW]);`
			`}`
			`}`
			`for (unsigned Chan = 0; Chan < 4; ++Chan) {`
			`bool isOccupied = OccupedSlotsMask & (1 << Chan);`
			`if (!isOccupied) {`
			`SUnit *SU = AttemptFillSlot(Chan);`
			`if (SU) {`
			`OccupedSlotsMask \|= (1 << Chan);`
			`InstructionsGroupCandidate[Chan] = SU;`
			`return SU;`
			`}`
			`}`
			`}`
			`PrepareNextSlot();`
			`}`
			`return NULL;`
			`}`

			`SUnit* R600SchedStrategy::pickOther(int QID) {`
			`SUnit *SU = 0;`
			`ReadyQueue *AQ = Available[QID];`

			`if (AQ->empty()) {`
			`MoveUnits(Pending[QID], AQ);`
			`}`
			`if (!AQ->empty()) {`
			`SU = *AQ->begin();`
			`AQ->remove(AQ->begin());`
			`}`
			`return SU;`
			`}`