llvm-mirror/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp

//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
#include <deque>
#include <iterator>
#include <map>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "structcfg"

#define DEFAULT_VEC_SLOTS 8

// TODO: move-begin.

//===----------------------------------------------------------------------===//
//
// Statistics for CFGStructurizer.
//
//===----------------------------------------------------------------------===//

STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
    "matched");
STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
    "matched");
STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");

namespace llvm {

void initializeAMDGPUCFGStructurizerPass(PassRegistry &);

} // end namespace llvm

namespace {

//===----------------------------------------------------------------------===//
//
// Miscellaneous utility for CFGStructurizer.
//
//===----------------------------------------------------------------------===//

#define SHOWNEWINSTR(i) \
  DEBUG(dbgs() << "New instr: " << *i << "\n");

#define SHOWNEWBLK(b, msg) \
DEBUG( \
  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
  dbgs() << "\n"; \
);

#define SHOWBLK_DETAIL(b, msg) \
DEBUG( \
  if (b) { \
  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
  b->print(dbgs()); \
  dbgs() << "\n"; \
  } \
);

#define INVALIDSCCNUM -1

//===----------------------------------------------------------------------===//
//
// supporting data structure for CFGStructurizer
//
//===----------------------------------------------------------------------===//

class BlockInformation {
public:
  bool IsRetired = false;
  int SccNum = INVALIDSCCNUM;

  BlockInformation() = default;
};

//===----------------------------------------------------------------------===//
//
// CFGStructurizer
//
//===----------------------------------------------------------------------===//

class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
  using MBBVector = SmallVector<MachineBasicBlock *, 32>;
  using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>;
  using LoopLandInfoMap = std::map<MachineLoop *, MachineBasicBlock *>;

  enum PathToKind {
    Not_SinglePath = 0,
    SinglePath_InPath = 1,
    SinglePath_NotInPath = 2
  };

  static char ID;

  AMDGPUCFGStructurizer() : MachineFunctionPass(ID) {
    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override {
    return "AMDGPU Control Flow Graph structurizer Pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  /// Perform the CFG structurization
  bool run();

  /// Perform the CFG preparation
  /// This step will remove every unconditionnal/dead jump instructions and make
  /// sure all loops have an exit block
  bool prepare();

  bool runOnMachineFunction(MachineFunction &MF) override {
    TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
    TRI = &TII->getRegisterInfo();
    DEBUG(MF.dump(););
    OrderedBlks.clear();
    Visited.clear();
    FuncRep = &MF;
    MLI = &getAnalysis<MachineLoopInfo>();
    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
    MDT = &getAnalysis<MachineDominatorTree>();
    DEBUG(MDT->print(dbgs(), (const Module*)nullptr););
    PDT = &getAnalysis<MachinePostDominatorTree>();
    DEBUG(PDT->print(dbgs()););
    prepare();
    run();
    DEBUG(MF.dump(););
    return true;
  }

protected:
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;
  MachineLoopInfo *MLI;
  const R600InstrInfo *TII = nullptr;
  const R600RegisterInfo *TRI = nullptr;

  // PRINT FUNCTIONS
  /// Print the ordered Blocks.
  void printOrderedBlocks() const {
    size_t i = 0;
    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
      dbgs() << "BB" << (*iterBlk)->getNumber();
      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
      if (i != 0 && i % 10 == 0) {
        dbgs() << "\n";
      } else {
        dbgs() << " ";
      }
    }
  }

  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
    for (MachineLoop::iterator iter = LoopInfo.begin(),
         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
      (*iter)->print(dbgs(), 0);
    }
  }

  // UTILITY FUNCTIONS
  int getSCCNum(MachineBasicBlock *MBB) const;
  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
  bool hasBackEdge(MachineBasicBlock *MBB) const;
  bool isRetiredBlock(MachineBasicBlock *MBB) const;
  bool isActiveLoophead(MachineBasicBlock *MBB) const;
  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
      bool AllowSideEntry = true) const;
  int countActiveBlock(MBBVector::const_iterator It,
      MBBVector::const_iterator E) const;
  bool needMigrateBlock(MachineBasicBlock *MBB) const;

  // Utility Functions
  void reversePredicateSetter(MachineBasicBlock::iterator I,
                              MachineBasicBlock &MBB);
  /// Compute the reversed DFS post order of Blocks
  void orderBlocks(MachineFunction *MF);

  // Function originally from CFGStructTraits
  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
                      const DebugLoc &DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
                                  const DebugLoc &DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
                              const DebugLoc &DL);
  void insertCondBranchBefore(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator I, int NewOpcode,
                              int RegNum, const DebugLoc &DL);

  static int getBranchNzeroOpcode(int OldOpcode);
  static int getBranchZeroOpcode(int OldOpcode);
  static int getContinueNzeroOpcode(int OldOpcode);
  static int getContinueZeroOpcode(int OldOpcode);
  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
      MachineInstr *MI);
  static bool isCondBranch(MachineInstr *MI);
  static bool isUncondBranch(MachineInstr *MI);
  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);

  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
  ///
  /// BB with backward-edge could have move instructions after the branch
  /// instruction.  Such move instruction "belong to" the loop backward-edge.
  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);

  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
  static bool isReturnBlock(MachineBasicBlock *MBB);
  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB);
  static MachineBasicBlock *clone(MachineBasicBlock *MBB);

  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
  /// because the AMDGPU instruction is not recognized as terminator fix this
  /// and retire this routine
  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);

  static void wrapup(MachineBasicBlock *MBB);

  int patternMatch(MachineBasicBlock *MBB);
  int patternMatchGroup(MachineBasicBlock *MBB);
  int serialPatternMatch(MachineBasicBlock *MBB);
  int ifPatternMatch(MachineBasicBlock *MBB);
  int loopendPatternMatch();
  int mergeLoop(MachineLoop *LoopRep);

  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
  /// the same loop with LoopLandInfo without explicitly keeping track of
  /// loopContBlks and loopBreakBlks, this is a method to get the information.
  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
      MachineBasicBlock *Src2MBB);
  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock **LandMBBPtr);
  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock *LandMBB, bool Detail = false);
  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
  void mergeSerialBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB);

  void mergeIfthenelseBlock(MachineInstr *BranchMI,
      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *LandMBB);
  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
      MachineBasicBlock *LandMBB);
  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
      MachineBasicBlock *ContMBB);

  /// normalizeInfiniteLoopExit change
  ///   B1:
  ///        uncond_br LoopHeader
  ///
  /// to
  ///   B1:
  ///        cond_br 1 LoopHeader dummyExit
  /// and return the newly added dummy exit block
  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
  void removeUnconditionalBranch(MachineBasicBlock *MBB);

  /// Remove duplicate branches instructions in a block.
  /// For instance
  /// B0:
  ///    cond_br X B1 B2
  ///    cond_br X B1 B2
  /// is transformed to
  /// B0:
  ///    cond_br X B1 B2
  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);

  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
  void removeSuccessor(MachineBasicBlock *MBB);
  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
      MachineBasicBlock *PredMBB);
  void migrateInstruction(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
  void retireBlock(MachineBasicBlock *MBB);

private:
  MBBInfoMap BlockInfoMap;
  LoopLandInfoMap LLInfoMap;
  std::map<MachineLoop *, bool> Visited;
  MachineFunction *FuncRep;
  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};

} // end anonymous namespace

char AMDGPUCFGStructurizer::ID = 0;

int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
  if (It == BlockInfoMap.end())
    return INVALIDSCCNUM;
  return (*It).second->SccNum;
}

MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
    const {
  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
  if (It == LLInfoMap.end())
    return nullptr;
  return (*It).second;
}

bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
  if (!LoopRep)
    return false;
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  return MBB->isSuccessor(LoopHeader);
}

bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
  if (It == BlockInfoMap.end())
    return false;
  return (*It).second->IsRetired;
}

bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
  while (LoopRep && LoopRep->getHeader() == MBB) {
    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
    if(!LoopLand)
      return true;
    if (!isRetiredBlock(LoopLand))
      return true;
    LoopRep = LoopRep->getParentLoop();
  }
  return false;
}

AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
    bool AllowSideEntry) const {
  assert(DstMBB);
  if (SrcMBB == DstMBB)
    return SinglePath_InPath;
  while (SrcMBB && SrcMBB->succ_size() == 1) {
    SrcMBB = *SrcMBB->succ_begin();
    if (SrcMBB == DstMBB)
      return SinglePath_InPath;
    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
      return Not_SinglePath;
  }
  if (SrcMBB && SrcMBB->succ_size()==0)
    return SinglePath_NotInPath;
  return Not_SinglePath;
}

int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
    MBBVector::const_iterator E) const {
  int Count = 0;
  while (It != E) {
    if (!isRetiredBlock(*It))
      ++Count;
    ++It;
  }
  return Count;
}

bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
  unsigned BlockSizeThreshold = 30;
  unsigned CloneInstrThreshold = 100;
  bool MultiplePreds = MBB && (MBB->pred_size() > 1);

  if(!MultiplePreds)
    return false;
  unsigned BlkSize = MBB->size();
  return ((BlkSize > BlockSizeThreshold) &&
      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
}

void AMDGPUCFGStructurizer::reversePredicateSetter(
    MachineBasicBlock::iterator I, MachineBasicBlock &MBB) {
  assert(I.isValid() && "Expected valid iterator");
  for (;; --I) {
    if (I == MBB.end())
      continue;
    if (I->getOpcode() == AMDGPU::PRED_X) {
      switch (I->getOperand(2).getImm()) {
      case AMDGPU::PRED_SETE_INT:
        I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
        return;
      case AMDGPU::PRED_SETNE_INT:
        I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
        return;
      case AMDGPU::PRED_SETE:
        I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
        return;
      case AMDGPU::PRED_SETNE:
        I->getOperand(2).setImm(AMDGPU::PRED_SETE);
        return;
      default:
        llvm_unreachable("PRED_X Opcode invalid!");
      }
    }
  }
}

void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
                                           int NewOpcode, const DebugLoc &DL) {
  MachineInstr *MI =
      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
  MBB->push_back(MI);
  //assume the instruction doesn't take any reg operand ...
  SHOWNEWINSTR(MI);
}

MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
                                                       int NewOpcode,
                                                       const DebugLoc &DL) {
  MachineInstr *MI =
      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
  if (MBB->begin() != MBB->end())
    MBB->insert(MBB->begin(), MI);
  else
    MBB->push_back(MI);
  SHOWNEWINSTR(MI);
  return MI;
}

MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
    MachineBasicBlock::iterator I, int NewOpcode) {
  MachineInstr *OldMI = &(*I);
  MachineBasicBlock *MBB = OldMI->getParent();
  MachineInstr *NewMBB =
      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
  MBB->insert(I, NewMBB);
  //assume the instruction doesn't take any reg operand ...
  SHOWNEWINSTR(NewMBB);
  return NewMBB;
}

void AMDGPUCFGStructurizer::insertCondBranchBefore(
    MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
  MachineInstr *OldMI = &(*I);
  MachineBasicBlock *MBB = OldMI->getParent();
  MachineFunction *MF = MBB->getParent();
  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
  MBB->insert(I, NewMI);
  MachineInstrBuilder MIB(*MF, NewMI);
  MIB.addReg(OldMI->getOperand(1).getReg(), false);
  SHOWNEWINSTR(NewMI);
  //erase later oldInstr->eraseFromParent();
}

void AMDGPUCFGStructurizer::insertCondBranchBefore(
    MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
    int RegNum, const DebugLoc &DL) {
  MachineFunction *MF = blk->getParent();
  MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
  //insert before
  blk->insert(I, NewInstr);
  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
  SHOWNEWINSTR(NewInstr);
}

int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
  switch(OldOpcode) {
  case AMDGPU::JUMP_COND:
  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
  case AMDGPU::BRANCH_COND_i32:
  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
  default: llvm_unreachable("internal error");
  }
  return -1;
}

int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
  switch(OldOpcode) {
  case AMDGPU::JUMP_COND:
  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
  case AMDGPU::BRANCH_COND_i32:
  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
  default: llvm_unreachable("internal error");
  }
  return -1;
}

int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
  switch(OldOpcode) {
  case AMDGPU::JUMP_COND:
  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
  default: llvm_unreachable("internal error");
  }
  return -1;
}

int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
  switch(OldOpcode) {
  case AMDGPU::JUMP_COND:
  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
  default: llvm_unreachable("internal error");
  }
  return -1;
}

MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
  return MI->getOperand(0).getMBB();
}

void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
    MachineBasicBlock *MBB) {
  MI->getOperand(0).setMBB(MBB);
}

MachineBasicBlock *
AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
    MachineInstr *MI) {
  assert(MBB->succ_size() == 2);
  MachineBasicBlock *TrueBranch = getTrueBranch(MI);
  MachineBasicBlock::succ_iterator It = MBB->succ_begin();
  MachineBasicBlock::succ_iterator Next = It;
  ++Next;
  return (*It == TrueBranch) ? *Next : *It;
}

bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
  switch (MI->getOpcode()) {
    case AMDGPU::JUMP_COND:
    case AMDGPU::BRANCH_COND_i32:
    case AMDGPU::BRANCH_COND_f32: return true;
  default:
    return false;
  }
  return false;
}

bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::JUMP:
  case AMDGPU::BRANCH:
    return true;
  default:
    return false;
  }
  return false;
}

DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
  //get DebugLoc from the first MachineBasicBlock instruction with debug info
  DebugLoc DL;
  for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
      ++It) {
    MachineInstr *instr = &(*It);
    if (instr->getDebugLoc())
      DL = instr->getDebugLoc();
  }
  return DL;
}

MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
    MachineBasicBlock *MBB) {
  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
  MachineInstr *MI = &*It;
  if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
    return MI;
  return nullptr;
}

MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
    MachineBasicBlock *MBB) {
  for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
      It != E; ++It) {
    // FIXME: Simplify
    MachineInstr *MI = &*It;
    if (MI) {
      if (isCondBranch(MI) || isUncondBranch(MI))
        return MI;
      else if (!TII->isMov(MI->getOpcode()))
        break;
    }
  }
  return nullptr;
}

MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
  if (It != MBB->rend()) {
    MachineInstr *instr = &(*It);
    if (instr->getOpcode() == AMDGPU::RETURN)
      return instr;
  }
  return nullptr;
}

bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
  MachineInstr *MI = getReturnInstr(MBB);
  bool IsReturn = (MBB->succ_size() == 0);
  if (MI)
    assert(IsReturn);
  else if (IsReturn)
    DEBUG(
      dbgs() << "BB" << MBB->getNumber()
             <<" is return block without RETURN instr\n";);
  return  IsReturn;
}

void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
    MachineBasicBlock *SrcMBB) {
  for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
       iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
    DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
}

MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
  MachineFunction *Func = MBB->getParent();
  MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
  Func->push_back(NewMBB);  //insert to function
  for (const MachineInstr &It : *MBB)
    NewMBB->push_back(Func->CloneMachineInstr(&It));
  return NewMBB;
}

void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
    MachineBasicBlock *NewBlk) {
  MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
  if (BranchMI && isCondBranch(BranchMI) &&
      getTrueBranch(BranchMI) == OldMBB)
    setTrueBranch(BranchMI, NewBlk);
}

void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
  assert((!MBB->getParent()->getJumpTableInfo()
          || MBB->getParent()->getJumpTableInfo()->isEmpty())
         && "found a jump table");

   //collect continue right before endloop
   SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
   MachineBasicBlock::iterator Pre = MBB->begin();
   MachineBasicBlock::iterator E = MBB->end();
   MachineBasicBlock::iterator It = Pre;
   while (It != E) {
     if (Pre->getOpcode() == AMDGPU::CONTINUE
         && It->getOpcode() == AMDGPU::ENDLOOP)
       ContInstr.push_back(&*Pre);
     Pre = It;
     ++It;
   }

   //delete continue right before endloop
   for (unsigned i = 0; i < ContInstr.size(); ++i)
      ContInstr[i]->eraseFromParent();

   // TODO to fix up jump table so later phase won't be confused.  if
   // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
   // there isn't such an interface yet.  alternatively, replace all the other
   // blocks in the jump table with the entryBlk //}
}

bool AMDGPUCFGStructurizer::prepare() {
  bool Changed = false;

  //FIXME: if not reducible flow graph, make it so ???

  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);

  orderBlocks(FuncRep);

  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;

  // Add an ExitBlk to loop that don't have one
  for (MachineLoopInfo::iterator It = MLI->begin(),
       E = MLI->end(); It != E; ++It) {
    MachineLoop *LoopRep = (*It);
    MBBVector ExitingMBBs;
    LoopRep->getExitingBlocks(ExitingMBBs);

    if (ExitingMBBs.size() == 0) {
      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
      if (DummyExitBlk)
        RetBlks.push_back(DummyExitBlk);
    }
  }

  // Remove unconditional branch instr.
  // Add dummy exit block iff there are multiple returns.
  for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
       It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
    MachineBasicBlock *MBB = *It;
    removeUnconditionalBranch(MBB);
    removeRedundantConditionalBranch(MBB);
    if (isReturnBlock(MBB)) {
      RetBlks.push_back(MBB);
    }
    assert(MBB->succ_size() <= 2);
  }

  if (RetBlks.size() >= 2) {
    addDummyExitBlock(RetBlks);
    Changed = true;
  }

  return Changed;
}

bool AMDGPUCFGStructurizer::run() {
  //Assume reducible CFG...
  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");

#ifdef STRESSTEST
  //Use the worse block ordering to test the algorithm.
  ReverseVector(orderedBlks);
#endif

  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
  int NumIter = 0;
  bool Finish = false;
  MachineBasicBlock *MBB;
  bool MakeProgress = false;
  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
                                        OrderedBlks.end());

  do {
    ++NumIter;
    DEBUG(
      dbgs() << "numIter = " << NumIter
             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
    );

    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
        OrderedBlks.begin();
    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
        OrderedBlks.end();

    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
        It;
    MachineBasicBlock *SccBeginMBB = nullptr;
    int SccNumBlk = 0;  // The number of active blocks, init to a
                        // maximum possible number.
    int SccNumIter;     // Number of iteration in this SCC.

    while (It != E) {
      MBB = *It;

      if (!SccBeginMBB) {
        SccBeginIter = It;
        SccBeginMBB = MBB;
        SccNumIter = 0;
        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
        DEBUG(
              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
              dbgs() << "\n";
        );
      }

      if (!isRetiredBlock(MBB))
        patternMatch(MBB);

      ++It;

      bool ContNextScc = true;
      if (It == E
          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
        // Just finish one scc.
        ++SccNumIter;
        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
          DEBUG(
            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
                   << ", sccNumIter = " << SccNumIter;
            dbgs() << "doesn't make any progress\n";
          );
          ContNextScc = true;
        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
          SccNumBlk = sccRemainedNumBlk;
          It = SccBeginIter;
          ContNextScc = false;
          DEBUG(
            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
                   << "sccNumIter = " << SccNumIter << '\n';
          );
        } else {
          // Finish the current scc.
          ContNextScc = true;
        }
      } else {
        // Continue on next component in the current scc.
        ContNextScc = false;
      }

      if (ContNextScc)
        SccBeginMBB = nullptr;
    } //while, "one iteration" over the function.

    MachineBasicBlock *EntryMBB =
        *GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
    if (EntryMBB->succ_size() == 0) {
      Finish = true;
      DEBUG(
        dbgs() << "Reduce to one block\n";
      );
    } else {
      int NewnumRemainedBlk
        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
      // consider cloned blocks ??
      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
        MakeProgress = true;
        NumRemainedBlk = NewnumRemainedBlk;
      } else {
        MakeProgress = false;
        DEBUG(
          dbgs() << "No progress\n";
        );
      }
    }
  } while (!Finish && MakeProgress);

  // Misc wrap up to maintain the consistency of the Function representation.
  wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));

  // Detach retired Block, release memory.
  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
      It != E; ++It) {
    if ((*It).second && (*It).second->IsRetired) {
      assert(((*It).first)->getNumber() != -1);
      DEBUG(
        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
      );
      (*It).first->eraseFromParent();  //Remove from the parent Function.
    }
    delete (*It).second;
  }
  BlockInfoMap.clear();
  LLInfoMap.clear();

  if (!Finish) {
    DEBUG(FuncRep->viewCFG());
    report_fatal_error("IRREDUCIBLE_CFG");
  }

  return true;
}

void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
  int SccNum = 0;
  MachineBasicBlock *MBB;
  for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
       ++It, ++SccNum) {
    const std::vector<MachineBasicBlock *> &SccNext = *It;
    for (std::vector<MachineBasicBlock *>::const_iterator
         blockIter = SccNext.begin(), blockEnd = SccNext.end();
         blockIter != blockEnd; ++blockIter) {
      MBB = *blockIter;
      OrderedBlks.push_back(MBB);
      recordSccnum(MBB, SccNum);
    }
  }

  // walk through all the block in func to check for unreachable
  for (auto *MBB : nodes(MF)) {
    SccNum = getSCCNum(MBB);
    if (SccNum == INVALIDSCCNUM)
      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
  }
}

int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
  int NumMatch = 0;
  int CurMatch;

  DEBUG(
        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
  );

  while ((CurMatch = patternMatchGroup(MBB)) > 0)
    NumMatch += CurMatch;

  DEBUG(
        dbgs() << "End patternMatch BB" << MBB->getNumber()
      << ", numMatch = " << NumMatch << "\n";
  );

  return NumMatch;
}

int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
  int NumMatch = 0;
  NumMatch += loopendPatternMatch();
  NumMatch += serialPatternMatch(MBB);
  NumMatch += ifPatternMatch(MBB);
  return NumMatch;
}

int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
  if (MBB->succ_size() != 1)
    return 0;

  MachineBasicBlock *childBlk = *MBB->succ_begin();
  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
    return 0;

  mergeSerialBlock(MBB, childBlk);
  ++numSerialPatternMatch;
  return 1;
}

int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
  //two edges
  if (MBB->succ_size() != 2)
    return 0;
  if (hasBackEdge(MBB))
    return 0;
  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
  if (!BranchMI)
    return 0;

  assert(isCondBranch(BranchMI));
  int NumMatch = 0;

  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
  NumMatch += serialPatternMatch(TrueMBB);
  NumMatch += ifPatternMatch(TrueMBB);
  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
  NumMatch += serialPatternMatch(FalseMBB);
  NumMatch += ifPatternMatch(FalseMBB);
  MachineBasicBlock *LandBlk;
  int Cloned = 0;

  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
  // TODO: Simplify
  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
    // Diamond pattern
    LandBlk = *TrueMBB->succ_begin();
  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
    // Triangle pattern, false is empty
    LandBlk = FalseMBB;
    FalseMBB = nullptr;
  } else if (FalseMBB->succ_size() == 1
             && *FalseMBB->succ_begin() == TrueMBB) {
    // Triangle pattern, true is empty
    // We reverse the predicate to make a triangle, empty false pattern;
    std::swap(TrueMBB, FalseMBB);
    reversePredicateSetter(MBB->end(), *MBB);
    LandBlk = FalseMBB;
    FalseMBB = nullptr;
  } else if (FalseMBB->succ_size() == 1
             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
    LandBlk = *FalseMBB->succ_begin();
  } else if (TrueMBB->succ_size() == 1
    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
    LandBlk = *TrueMBB->succ_begin();
  } else {
    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
  }

  // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
  // new BB created for landBlk==NULL may introduce new challenge to the
  // reduction process.
  if (LandBlk &&
      ((TrueMBB && TrueMBB->pred_size() > 1)
      || (FalseMBB && FalseMBB->pred_size() > 1))) {
     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
  }

  if (TrueMBB && TrueMBB->pred_size() > 1) {
    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
    ++Cloned;
  }

  if (FalseMBB && FalseMBB->pred_size() > 1) {
    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
    ++Cloned;
  }

  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);

  ++numIfPatternMatch;

  numClonedBlock += Cloned;

  return 1 + Cloned + NumMatch;
}

int AMDGPUCFGStructurizer::loopendPatternMatch() {
  std::deque<MachineLoop *> NestedLoops;
  for (auto &It: *MLI)
    for (MachineLoop *ML : depth_first(It))
      NestedLoops.push_front(ML);

  if (NestedLoops.empty())
    return 0;

  // Process nested loop outside->inside (we did push_front),
  // so "continue" to a outside loop won't be mistaken as "break"
  // of the current loop.
  int Num = 0;
  for (MachineLoop *ExaminedLoop : NestedLoops) {
    if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
      continue;
    DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
    int NumBreak = mergeLoop(ExaminedLoop);
    if (NumBreak == -1)
      break;
    Num += NumBreak;
  }
  return Num;
}

int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  MBBVector ExitingMBBs;
  LoopRep->getExitingBlocks(ExitingMBBs);
  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
  // We assume a single ExitBlk
  MBBVector ExitBlks;
  LoopRep->getExitBlocks(ExitBlks);
  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
    ExitBlkSet.insert(ExitBlks[i]);
  assert(ExitBlkSet.size() == 1);
  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
  assert(ExitBlk && "Loop has several exit block");
  MBBVector LatchBlks;
  for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader))
    if (LoopRep->contains(LB))
      LatchBlks.push_back(LB);

  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
    settleLoopcontBlock(LatchBlks[i], LoopHeader);
  int Match = 0;
  do {
    Match = 0;
    Match += serialPatternMatch(LoopHeader);
    Match += ifPatternMatch(LoopHeader);
  } while (Match > 0);
  mergeLooplandBlock(LoopHeader, ExitBlk);
  MachineLoop *ParentLoop = LoopRep->getParentLoop();
  if (ParentLoop)
    MLI->changeLoopFor(LoopHeader, ParentLoop);
  else
    MLI->removeBlock(LoopHeader);
  Visited[LoopRep] = true;
  return 1;
}

bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
  if (Src1MBB->succ_size() == 0) {
    MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
    if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
      MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
      if (TheEntry) {
        DEBUG(
          dbgs() << "isLoopContBreakBlock yes src1 = BB"
                 << Src1MBB->getNumber()
                 << " src2 = BB" << Src2MBB->getNumber() << "\n";
        );
        return true;
      }
    }
  }
  return false;
}

int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
  int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
  if (Num == 0) {
    DEBUG(
      dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
    );
    Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
  }
  return Num;
}

int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
  int Num = 0;
  MachineBasicBlock *DownBlk;

  //trueBlk could be the common post dominator
  DownBlk = TrueMBB;

  DEBUG(
    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
           << " true = BB" << TrueMBB->getNumber()
           << ", numSucc=" << TrueMBB->succ_size()
           << " false = BB" << FalseMBB->getNumber() << "\n";
  );

  while (DownBlk) {
    DEBUG(
      dbgs() << "check down = BB" << DownBlk->getNumber();
    );

    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
      DEBUG(
        dbgs() << " working\n";
      );

      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);

      numClonedBlock += Num;
      Num += serialPatternMatch(*HeadMBB->succ_begin());
      Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
      Num += ifPatternMatch(HeadMBB);
      assert(Num > 0);

      break;
    }
    DEBUG(
      dbgs() << " not working\n";
    );
    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
  } // walk down the postDomTree

  return Num;
}

#ifndef NDEBUG
void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
  dbgs() << "head = BB" << HeadMBB->getNumber()
         << " size = " << HeadMBB->size();
  if (Detail) {
    dbgs() << "\n";
    HeadMBB->print(dbgs());
    dbgs() << "\n";
  }

  if (TrueMBB) {
    dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
           << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
    if (Detail) {
      dbgs() << "\n";
      TrueMBB->print(dbgs());
      dbgs() << "\n";
    }
  }
  if (FalseMBB) {
    dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
           << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
    if (Detail) {
      dbgs() << "\n";
      FalseMBB->print(dbgs());
      dbgs() << "\n";
    }
  }
  if (LandMBB) {
    dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
           << LandMBB->size() << " numPred = " << LandMBB->pred_size();
    if (Detail) {
      dbgs() << "\n";
      LandMBB->print(dbgs());
      dbgs() << "\n";
    }
  }

  dbgs() << "\n";
}
#endif

int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    MachineBasicBlock **LandMBBPtr) {
  bool MigrateTrue = false;
  bool MigrateFalse = false;

  MachineBasicBlock *LandBlk = *LandMBBPtr;

  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
         && (!FalseMBB || FalseMBB->succ_size() <= 1));

  if (TrueMBB == FalseMBB)
    return 0;

  MigrateTrue = needMigrateBlock(TrueMBB);
  MigrateFalse = needMigrateBlock(FalseMBB);

  if (!MigrateTrue && !MigrateFalse)
    return 0;

  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
  // have more than one predecessors.  without doing this, its predecessor
  // rather than headBlk will have undefined value in initReg.
  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
    MigrateTrue = true;
  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
    MigrateFalse = true;

  DEBUG(
    dbgs() << "before improveSimpleJumpintoIf: ";
    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
  );

  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
  //
  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
  //      {initReg = 0; org falseBlk branch }
  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
  //      => org landBlk
  //      if landBlk->pred_size() > 2, put the about if-else inside
  //      if (initReg !=2) {...}
  //
  // add initReg = initVal to headBlk

  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
  if (!MigrateTrue || !MigrateFalse) {
    // XXX: We have an opportunity here to optimize the "branch into if" case
    // here.  Branch into if looks like this:
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //             /      \           |
    // diamond_false        diamond_true
    //             \      /
    //               done
    //
    // The diamond_head block begins the "if" and the diamond_true block
    // is the block being "branched into".
    //
    // If MigrateTrue is true, then TrueBB is the block being "branched into"
    // and if MigrateFalse is true, then FalseBB is the block being
    // "branched into"
    //
    // Here is the pseudo code for how I think the optimization should work:
    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
    // 3. Move the branch instruction from diamond_head into its own basic
    //    block (new_block).
    // 4. Add an unconditional branch from diamond_head to new_block
    // 5. Replace the branch instruction in branch_from with an unconditional
    //    branch to new_block.  If branch_from has multiple predecessors, then
    //    we need to replace the True/False block in the branch
    //    instruction instead of replacing it.
    // 6. Change the condition of the branch instruction in new_block from
    //    COND to (COND || GPR0)
    //
    // In order insert these MOV instruction, we will need to use the
    // RegisterScavenger.  Usually liveness stops being tracked during
    // the late machine optimization passes, however if we implement
    // bool TargetRegisterInfo::requiresRegisterScavenging(
    //                                                const MachineFunction &MF)
    // and have it return true, liveness will be tracked correctly
    // by generic optimization passes.  We will also need to make sure that
    // all of our target-specific passes that run after regalloc and before
    // the CFGStructurizer track liveness and we will need to modify this pass
    // to correctly track liveness.
    //
    // After the above changes, the new CFG should look like this:
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //                       \     /
    //                      new_block
    //                      /      |
    //         diamond_false        diamond_true
    //                      \      /
    //                        done
    //
    // Without this optimization, we are forced to duplicate the diamond_true
    // block and we will end up with a CFG like this:
    //
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //             /      \                   |
    // diamond_false        diamond_true      diamond_true (duplicate)
    //             \      /                   |
    //               done --------------------|
    //
    // Duplicating diamond_true can be very costly especially if it has a
    // lot of instructions.
    return 0;
  }

  int NumNewBlk = 0;

  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);

  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);

  if (LandBlkHasOtherPred) {
    report_fatal_error("Extra register needed to handle CFG");
    unsigned CmpResReg =
      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
    report_fatal_error("Extra compare instruction needed to handle CFG");
    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
        CmpResReg, DebugLoc());
  }

  // XXX: We are running this after RA, so creating virtual registers will
  // cause an assertion failure in the PostRA scheduling pass.
  unsigned InitReg =
    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
      DebugLoc());

  if (MigrateTrue) {
    migrateInstruction(TrueMBB, LandBlk, I);
    // need to uncondionally insert the assignment to ensure a path from its
    // predecessor rather than headBlk has valid value in initReg if
    // (initVal != 1).
    report_fatal_error("Extra register needed to handle CFG");
  }
  insertInstrBefore(I, AMDGPU::ELSE);

  if (MigrateFalse) {
    migrateInstruction(FalseMBB, LandBlk, I);
    // need to uncondionally insert the assignment to ensure a path from its
    // predecessor rather than headBlk has valid value in initReg if
    // (initVal != 0)
    report_fatal_error("Extra register needed to handle CFG");
  }

  if (LandBlkHasOtherPred) {
    // add endif
    insertInstrBefore(I, AMDGPU::ENDIF);

    // put initReg = 2 to other predecessors of landBlk
    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
         PE = LandBlk->pred_end(); PI != PE; ++PI) {
      MachineBasicBlock *MBB = *PI;
      if (MBB != TrueMBB && MBB != FalseMBB)
        report_fatal_error("Extra register needed to handle CFG");
    }
  }
  DEBUG(
    dbgs() << "result from improveSimpleJumpintoIf: ";
    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
  );

  // update landBlk
  *LandMBBPtr = LandBlk;

  return NumNewBlk;
}

void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
    MachineBasicBlock *SrcMBB) {
  DEBUG(
    dbgs() << "serialPattern BB" << DstMBB->getNumber()
           << " <= BB" << SrcMBB->getNumber() << "\n";
  );
  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());

  DstMBB->removeSuccessor(SrcMBB, true);
  cloneSuccessorList(DstMBB, SrcMBB);

  removeSuccessor(SrcMBB);
  MLI->removeBlock(SrcMBB);
  retireBlock(SrcMBB);
}

void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
  assert (TrueMBB);
  DEBUG(
    dbgs() << "ifPattern BB" << MBB->getNumber();
    dbgs() << "{  ";
    if (TrueMBB) {
      dbgs() << "BB" << TrueMBB->getNumber();
    }
    dbgs() << "  } else ";
    dbgs() << "{  ";
    if (FalseMBB) {
      dbgs() << "BB" << FalseMBB->getNumber();
    }
    dbgs() << "  }\n ";
    dbgs() << "landBlock: ";
    if (!LandMBB) {
      dbgs() << "NULL";
    } else {
      dbgs() << "BB" << LandMBB->getNumber();
    }
    dbgs() << "\n";
  );

  int OldOpcode = BranchMI->getOpcode();
  DebugLoc BranchDL = BranchMI->getDebugLoc();

//    transform to
//    if cond
//       trueBlk
//    else
//       falseBlk
//    endif
//    landBlk

  MachineBasicBlock::iterator I = BranchMI;
  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
      BranchDL);

  if (TrueMBB) {
    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
    MBB->removeSuccessor(TrueMBB, true);
    if (LandMBB && TrueMBB->succ_size()!=0)
      TrueMBB->removeSuccessor(LandMBB, true);
    retireBlock(TrueMBB);
    MLI->removeBlock(TrueMBB);
  }

  if (FalseMBB) {
    insertInstrBefore(I, AMDGPU::ELSE);
    MBB->splice(I, FalseMBB, FalseMBB->begin(),
                   FalseMBB->end());
    MBB->removeSuccessor(FalseMBB, true);
    if (LandMBB && FalseMBB->succ_size() != 0)
      FalseMBB->removeSuccessor(LandMBB, true);
    retireBlock(FalseMBB);
    MLI->removeBlock(FalseMBB);
  }
  insertInstrBefore(I, AMDGPU::ENDIF);

  BranchMI->eraseFromParent();

  if (LandMBB && TrueMBB && FalseMBB)
    MBB->addSuccessor(LandMBB);
}

void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
    MachineBasicBlock *LandMBB) {
  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
               << " land = BB" << LandMBB->getNumber() << "\n";);

  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
  DstBlk->replaceSuccessor(DstBlk, LandMBB);
}

void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
    MachineBasicBlock *LandMBB) {
  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
               << " land = BB" << LandMBB->getNumber() << "\n";);
  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
  assert(BranchMI && isCondBranch(BranchMI));
  DebugLoc DL = BranchMI->getDebugLoc();
  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
  MachineBasicBlock::iterator I = BranchMI;
  if (TrueBranch != LandMBB)
    reversePredicateSetter(I, *I->getParent());
  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
  insertInstrBefore(I, AMDGPU::BREAK);
  insertInstrBefore(I, AMDGPU::ENDIF);
  //now branchInst can be erase safely
  BranchMI->eraseFromParent();
  //now take care of successors, retire blocks
  ExitingMBB->removeSuccessor(LandMBB, true);
}

void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
    MachineBasicBlock *ContMBB) {
  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
               << ContingMBB->getNumber()
               << ", cont = BB" << ContMBB->getNumber() << "\n";);

  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
  if (MI) {
    assert(isCondBranch(MI));
    MachineBasicBlock::iterator I = MI;
    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
    int OldOpcode = MI->getOpcode();
    DebugLoc DL = MI->getDebugLoc();

    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);

    if (!UseContinueLogical) {
      int BranchOpcode =
          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
          getBranchZeroOpcode(OldOpcode);
      insertCondBranchBefore(I, BranchOpcode, DL);
      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
    } else {
      int BranchOpcode =
          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
          getContinueZeroOpcode(OldOpcode);
      insertCondBranchBefore(I, BranchOpcode, DL);
    }

    MI->eraseFromParent();
  } else {
    // if we've arrived here then we've already erased the branch instruction
    // travel back up the basic block to see the last reference of our debug
    // location we've just inserted that reference here so it should be
    // representative insertEnd to ensure phi-moves, if exist, go before the
    // continue-instr.
    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
        getLastDebugLocInBB(ContingMBB));
  }
}

int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
  int Cloned = 0;
  assert(PreMBB->isSuccessor(SrcMBB));
  while (SrcMBB && SrcMBB != DstMBB) {
    assert(SrcMBB->succ_size() == 1);
    if (SrcMBB->pred_size() > 1) {
      SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
      ++Cloned;
    }

    PreMBB = SrcMBB;
    SrcMBB = *SrcMBB->succ_begin();
  }

  return Cloned;
}

MachineBasicBlock *
AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
    MachineBasicBlock *PredMBB) {
  assert(PredMBB->isSuccessor(MBB) &&
         "succBlk is not a prececessor of curBlk");

  MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
  //srcBlk, oldBlk, newBlk

  PredMBB->replaceSuccessor(MBB, CloneMBB);

  // add all successor to cloneBlk
  cloneSuccessorList(CloneMBB, MBB);

  numClonedInstr += MBB->size();

  DEBUG(
    dbgs() << "Cloned block: " << "BB"
           << MBB->getNumber() << "size " << MBB->size() << "\n";
  );

  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");

  return CloneMBB;
}

void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
  MachineBasicBlock::iterator SpliceEnd;
  //look for the input branchinstr, not the AMDGPU branchinstr
  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
  if (!BranchMI) {
    DEBUG(
      dbgs() << "migrateInstruction don't see branch instr\n";
    );
    SpliceEnd = SrcMBB->end();
  } else {
    DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
    SpliceEnd = BranchMI;
  }
  DEBUG(
    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
      << "srcSize = " << SrcMBB->size() << "\n";
  );

  //splice insert before insertPos
  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);

  DEBUG(
    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
      << "srcSize = " << SrcMBB->size() << '\n';
  );
}

MachineBasicBlock *
AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();

  if (!LoopHeader || !LoopLatch)
    return nullptr;
  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
  // Is LoopRep an infinite loop ?
  if (!BranchMI || !isUncondBranch(BranchMI))
    return nullptr;

  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
  FuncRep->push_back(DummyExitBlk);  //insert to function
  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
  LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
  Ctx.emitError("Extra register needed to handle CFG");
  return nullptr;
}

void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
  MachineInstr *BranchMI;

  // I saw two unconditional branch in one basic block in example
  // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
  while ((BranchMI = getLoopendBlockBranchInstr(MBB))
          && isUncondBranch(BranchMI)) {
    DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
    BranchMI->eraseFromParent();
  }
}

void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
    MachineBasicBlock *MBB) {
  if (MBB->succ_size() != 2)
    return;
  MachineBasicBlock *MBB1 = *MBB->succ_begin();
  MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
  if (MBB1 != MBB2)
    return;

  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
  assert(BranchMI && isCondBranch(BranchMI));
  DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
  BranchMI->eraseFromParent();
  SHOWNEWBLK(MBB1, "Removing redundant successor");
  MBB->removeSuccessor(MBB1, true);
}

void AMDGPUCFGStructurizer::addDummyExitBlock(
    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
  FuncRep->push_back(DummyExitBlk);  //insert to function
  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);

  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
       E = RetMBB.end(); It != E; ++It) {
    MachineBasicBlock *MBB = *It;
    MachineInstr *MI = getReturnInstr(MBB);
    if (MI)
      MI->eraseFromParent();
    MBB->addSuccessor(DummyExitBlk);
    DEBUG(
      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
             << " successors\n";
    );
  }
  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
}

void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
  while (MBB->succ_size())
    MBB->removeSuccessor(*MBB->succ_begin());
}

void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
    int SccNum) {
  BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
  if (!srcBlkInfo)
    srcBlkInfo = new BlockInformation();
  srcBlkInfo->SccNum = SccNum;
}

void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
  DEBUG(
        dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
  );

  BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];

  if (!SrcBlkInfo)
    SrcBlkInfo = new BlockInformation();

  SrcBlkInfo->IsRetired = true;
  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
         && "can't retire block yet");
}

INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
                      "AMDGPU CFG Structurizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
                      "AMDGPU CFG Structurizer", false, false)

FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
  return new AMDGPUCFGStructurizer();
}