llvm-mirror/lib/Target/X86/X86FixupSetCC.cpp

//===- X86FixupSetCC.cpp - Fix zero-extension of setcc patterns -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a pass that fixes zero-extension of setcc patterns.
// X86 setcc instructions are modeled to have no input arguments, and a single
// GR8 output argument. This is consistent with other similar instructions
// (e.g. movb), but means it is impossible to directly generate a setcc into
// the lower GR8 of a specified GR32.
// This means that ISel must select (zext (setcc)) into something like
// seta %al; movzbl %al, %eax.
// Unfortunately, this can cause a stall due to the partial register write
// performed by the setcc. Instead, we can use:
// xor %eax, %eax; seta %al
// This both avoids the stall, and encodes shorter.
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "x86-fixup-setcc"

STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");

namespace {
/// Machine function pass that looks for a setcc whose GR8 result is
/// zero-extended by a MOVZX32rr8 and replaces the zero-extension with a
/// register zeroed before the flags-defining instruction plus an
/// INSERT_SUBREG of the setcc result into its low byte.
class X86FixupSetCCPass : public MachineFunctionPass {
public:
  X86FixupSetCCPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 Fixup SetCC"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  // Find the preceding instruction that imp-defs eflags.
  // Returns nullptr if none is found within SearchBound instructions or
  // before the beginning of the basic block.
  MachineInstr *findFlagsImpDef(MachineBasicBlock *MBB,
                                MachineBasicBlock::reverse_iterator MI);

  // Return true if MI imp-uses eflags.
  bool impUsesFlags(MachineInstr *MI);

  // Return true if this is the opcode of a SetCC instruction with a register
  // output.
  bool isSetCCr(unsigned Opcode);

  // Both are (re)assigned at the start of every runOnMachineFunction call;
  // they are null-initialized so a use before the first run is detectable.
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;

  // Maximum number of instructions findFlagsImpDef walks backwards.
  enum { SearchBound = 16 };

  static char ID;
};

char X86FixupSetCCPass::ID = 0;
}

/// Factory entry point: returns a newly allocated X86FixupSetCCPass.
FunctionPass *llvm::createX86FixupSetCC() {
  return new X86FixupSetCCPass();
}

/// Report whether \p Opcode is one of the sixteen register-output SETcc
/// opcodes listed below; every other opcode yields false.
bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {
  bool IsSetCC = false;

  switch (Opcode) {
  case X86::SETOr:
  case X86::SETNOr:
  case X86::SETBr:
  case X86::SETAEr:
  case X86::SETEr:
  case X86::SETNEr:
  case X86::SETBEr:
  case X86::SETAr:
  case X86::SETSr:
  case X86::SETNSr:
  case X86::SETPr:
  case X86::SETNPr:
  case X86::SETLr:
  case X86::SETGEr:
  case X86::SETLEr:
  case X86::SETGr:
    IsSetCC = true;
    break;
  default:
    break;
  }

  return IsSetCC;
}

// We expect the instruction *immediately* before the setcc to imp-def
// EFLAGS (because of scheduling glue). To make this less brittle w.r.t
// scheduling, look backwards until we hit the beginning of the
// basic-block, or a small bound (to avoid quadratic behavior).
//
// MBB is the block being searched and MI the reverse iterator at which the
// backwards walk starts. Returns the first instruction encountered that has
// an implicit def of EFLAGS, or nullptr if the walk ends (block start or
// SearchBound instructions visited) without finding one.
MachineInstr *
X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
                                   MachineBasicBlock::reverse_iterator MI) {
  // FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator?
  auto MBBStart = MBB->rend();
  for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)
    for (auto &Op : MI->implicit_operands())
      // getReg()/isDef() are only valid on register operands, and the
      // implicit operand range may contain non-register operands (such as
      // register masks), so check isReg() first.
      if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isDef())
        return &*MI;

  return nullptr;
}

// Return true if any implicit operand of MI is a use of EFLAGS.
bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {
  for (auto &Op : MI->implicit_operands())
    // getReg()/isUse() are only valid on register operands, and the implicit
    // operand range may contain non-register operands (such as register
    // masks), so check isReg() first.
    if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isUse())
      return true;

  return false;
}

// Scan every instruction of MF for a setcc whose result feeds a MOVZX32rr8.
// For each such pair, zero a 32-bit register ahead of the instruction that
// defines EFLAGS, insert the setcc result into its low byte with
// INSERT_SUBREG, and drop the MOVZX32rr8.
// Returns true if at least one pair was rewritten.
bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;
  MRI = &MF.getRegInfo();
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();

  // On 32-bit, we need to be careful to force an ABCD register.
  // The choice depends only on the subtarget, so make it once per function.
  const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
                                      ? &X86::GR32RegClass
                                      : &X86::GR32_ABCDRegClass;

  // Zero-extensions are erased only after the walk so that the instruction
  // lists are not shrunk while they are being iterated.
  SmallVector<MachineInstr*, 4> ToErase;

  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      // Find a setcc that is used by a zext.
      // This doesn't have to be the only use, the transformation is safe
      // regardless.
      if (!isSetCCr(MI.getOpcode()))
        continue;

      // If several MOVZX32rr8 use the setcc result, the last one visited is
      // the one that gets rewritten.
      MachineInstr *ZExt = nullptr;
      for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))
        if (Use.getOpcode() == X86::MOVZX32rr8)
          ZExt = &Use;

      if (!ZExt)
        continue;

      // Find the preceding instruction that imp-defs eflags.
      MachineInstr *FlagsDefMI = findFlagsImpDef(
          MI.getParent(), MachineBasicBlock::reverse_iterator(&MI));
      if (!FlagsDefMI)
        continue;

      // We'd like to put something that clobbers eflags directly before
      // FlagsDefMI. This can't hurt anything after FlagsDefMI, because
      // it, itself, by definition, clobbers eflags. But it may happen that
      // FlagsDefMI also *uses* eflags, in which case the transformation is
      // invalid.
      if (impUsesFlags(FlagsDefMI))
        continue;

      // The zext result is about to be defined by an INSERT_SUBREG of an RC
      // register, so its own register class must be compatible with RC. If
      // it cannot be constrained we would need an additional copy and are
      // better off keeping the MOVZX32rr8 we have now.
      unsigned ZExtReg = ZExt->getOperand(0).getReg();
      if (!MRI->constrainRegClass(ZExtReg, RC))
        continue;

      // Count the substitution only once it is certain to happen.
      ++NumSubstZexts;
      Changed = true;

      // Initialize a register with 0. This must go before the eflags def
      unsigned ZeroReg = MRI->createVirtualRegister(RC);
      BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
              ZeroReg);

      // X86 setcc only takes an output GR8, so fake a GR32 input by inserting
      // the setcc result into the low byte of the zeroed register. The
      // result is written straight to the register the zext used to define,
      // so existing users keep a register of a class they accept.
      BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
              TII->get(X86::INSERT_SUBREG), ZExtReg)
          .addReg(ZeroReg)
          .addReg(MI.getOperand(0).getReg())
          .addImm(X86::sub_8bit);
      ToErase.push_back(ZExt);
    }
  }

  for (auto &I : ToErase)
    I->eraseFromParent();

  return Changed;
}
Recommit r274692 - [X86] Transform setcc + movzbl into xorl + setcc xorl + setcc is generally the preferred sequence due to the partial register stall setcc + movzbl suffers from. As a bonus, it also encodes one byte smaller. This fixes PR28146. The original commit tried inserting an 8bit-subreg into a GR32 (not GR32_ABCD) which was not appreciated by fast regalloc on 32-bit. llvm-svn: 274802 2016-07-08 00:50:23 +02:00			`//===---- X86FixupSetCC.cpp - optimize usage of LEA instructions ----------===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`
			`//`
			`// This file defines a pass that fixes zero-extension of setcc patterns.`
			`// X86 setcc instructions are modeled to have no input arguments, and a single`
			`// GR8 output argument. This is consistent with other similar instructions`
			`// (e.g. movb), but means it is impossible to directly generate a setcc into`
			`// the lower GR8 of a specified GR32.`
			`// This means that ISel must select (zext (setcc)) into something like`
			`// seta %al; movzbl %al, %eax.`
			`// Unfortunately, this can cause a stall due to the partial register write`
			`// performed by the setcc. Instead, we can use:`
			`// xor %eax, %eax; seta %al`
			`// This both avoids the stall, and encodes shorter.`
			`//===----------------------------------------------------------------------===//`

			`#include "X86.h"`
			`#include "X86InstrInfo.h"`
			`#include "X86Subtarget.h"`
			`#include "llvm/ADT/Statistic.h"`
			`#include "llvm/CodeGen/MachineFunctionPass.h"`
			`#include "llvm/CodeGen/MachineInstrBuilder.h"`
			`#include "llvm/CodeGen/MachineRegisterInfo.h"`

			`using namespace llvm;`

			`#define DEBUG_TYPE "x86-fixup-setcc"`

			`STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");`

			`namespace {`
			`class X86FixupSetCCPass : public MachineFunctionPass {`
			`public:`
			`X86FixupSetCCPass() : MachineFunctionPass(ID) {}`

Use StringRef in Pass/PassManager APIs (NFC) llvm-svn: 283004 2016-10-01 04:56:57 +02:00			`StringRef getPassName() const override { return "X86 Fixup SetCC"; }`
Recommit r274692 - [X86] Transform setcc + movzbl into xorl + setcc xorl + setcc is generally the preferred sequence due to the partial register stall setcc + movzbl suffers from. As a bonus, it also encodes one byte smaller. This fixes PR28146. The original commit tried inserting an 8bit-subreg into a GR32 (not GR32_ABCD) which was not appreciated by fast regalloc on 32-bit. llvm-svn: 274802 2016-07-08 00:50:23 +02:00
			`bool runOnMachineFunction(MachineFunction &MF) override;`

			`private:`
			`// Find the preceding instruction that imp-defs eflags.`
			`MachineInstr *findFlagsImpDef(MachineBasicBlock *MBB,`
			`MachineBasicBlock::reverse_iterator MI);`

			`// Return true if MI imp-uses eflags.`
			`bool impUsesFlags(MachineInstr *MI);`

			`// Return true if this is the opcode of a SetCC instruction with a register`
			`// output.`
			`bool isSetCCr(unsigned Opode);`

			`MachineRegisterInfo *MRI;`
			`const X86InstrInfo *TII;`

			`enum { SearchBound = 16 };`

			`static char ID;`
			`};`

			`char X86FixupSetCCPass::ID = 0;`
			`}`

			`FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }`

			`bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {`
			`switch (Opcode) {`
			`default:`
			`return false;`
			`case X86::SETOr:`
			`case X86::SETNOr:`
			`case X86::SETBr:`
			`case X86::SETAEr:`
			`case X86::SETEr:`
			`case X86::SETNEr:`
			`case X86::SETBEr:`
			`case X86::SETAr:`
			`case X86::SETSr:`
			`case X86::SETNSr:`
			`case X86::SETPr:`
			`case X86::SETNPr:`
			`case X86::SETLr:`
			`case X86::SETGEr:`
			`case X86::SETLEr:`
			`case X86::SETGr:`
			`return true;`
			`}`
			`}`

			`// We expect the instruction *immediately* before the setcc to imp-def`
			`// EFLAGS (because of scheduling glue). To make this less brittle w.r.t`
			`// scheduling, look backwards until we hit the beginning of the`
			`// basic-block, or a small bound (to avoid quadratic behavior).`
			`MachineInstr *`
			`X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,`
			`MachineBasicBlock::reverse_iterator MI) {`
ADT: Give ilist<T>::reverse_iterator a handle to the current node Reverse iterators to doubly-linked lists can be simpler (and cheaper) than std::reverse_iterator. Make it so. In particular, change ilist<T>::reverse_iterator so that it is never invalidated unless the node it references is deleted. This matches the guarantees of ilist<T>::iterator. (Note: MachineBasicBlock::iterator is not an ilist iterator, but a MachineInstrBundleIterator<MachineInstr>. This commit does not change MachineBasicBlock::reverse_iterator, but it does update MachineBasicBlock::reverse_instr_iterator. See note at end of commit message for details on bundle iterators.) Given the list (with the Sentinel showing twice for simplicity): [Sentinel] <-> A <-> B <-> [Sentinel] the following is now true: 1. begin() represents A. 2. begin() holds the pointer for A. 3. end() represents [Sentinel]. 4. end() holds the poitner for [Sentinel]. 5. rbegin() represents B. 6. rbegin() holds the pointer for B. 7. rend() represents [Sentinel]. 8. rend() holds the pointer for [Sentinel]. The changes are #6 and #8. Here are some properties from the old scheme (which used std::reverse_iterator): - rbegin() held the pointer for [Sentinel] and rend() held the pointer for A; - operator() cost two dereferences instead of one; - converting from a valid iterator to its valid reverse_iterator involved a confusing increment; and - "RI++->erase()" left RI invalid. The unintuitive replacement was "RI->erase(), RE = end()". With vector-like data structures these properties are hard to avoid (since past-the-beginning is not a valid pointer), and don't impose a real cost (since there's still only one dereference, and all iterators are invalidated on erase). But with lists, this was a poor design. 
Specifically, the following code (which obviously works with normal iterators) now works with ilist::reverse_iterator as well: for (auto RI = L.rbegin(), RE = L.rend(); RI != RE;) fooThatMightRemoveArgFromList(RI++); Converting between iterator and reverse_iterator for the same node uses the getReverse() function. reverse_iterator iterator::getReverse(); iterator reverse_iterator::getReverse(); Why doesn't iterator <=> reverse_iterator conversion use constructors? In order to catch and update old code, reverse_iterator does not even have an explicit conversion from iterator. It wouldn't be safe because there would be no reasonable way to catch all the bugs from the changed semantic (see the changes at call sites that are part of this patch). Old code used this API: std::reverse_iterator::reverse_iterator(iterator); iterator std::reverse_iterator::base(); Here's how to update from old code to new (that incorporates the semantic change), assuming I is an ilist<>::iterator and RI is an ilist<>::reverse_iterator: [Old] ==> [New] reverse_iterator(I) (--I).getReverse() reverse_iterator(I) ++I.getReverse() --reverse_iterator(I) I.getReverse() reverse_iterator(++I) I.getReverse() RI.base() (--RI).getReverse() RI.base() ++RI.getReverse() --RI.base() RI.getReverse() (++RI).base() RI.getReverse() delete &RI, RE = end() delete &RI++ RI->erase(), RE = end() RI++->erase() ======================================= Note: bundle iterators are out of scope ======================================= MachineBasicBlock::iterator, also known as MachineInstrBundleIterator<MachineInstr>, is a wrapper to represent MachineInstr bundles. The idea is that each operator++ takes you to the beginning of the next bundle. Implementing a sane reverse iterator for this is harder than ilist. Here are the options: - Use std::reverse_iterator<MBB::i>. Store a handle to the beginning of the next bundle. A call to operator() runs a loop (usually operator--() will be called 1 time, for unbundled instructions). 
Increment/decrement just works. This is the status quo. - Store a handle to the final node in the bundle. A call to operator() still runs a loop, but it iterates one time fewer (usually operator--() will be called 0 times, for unbundled instructions). Increment/decrement just works. - Make the ilist_sentinel<MachineInstr> always store that it's the sentinel (instead of just in asserts mode). Then the bundle iterator can sniff the sentinel bit in operator++(). I initially tried implementing the end() option as part of this commit, but updating iterator/reverse_iterator conversion call sites was error-prone. I have a WIP series of patches that implements the final option. llvm-svn: 280032 2016-08-30 02:13:12 +02:00			`// FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator?`
			`auto MBBStart = MBB->rend();`
Recommit r274692 - [X86] Transform setcc + movzbl into xorl + setcc xorl + setcc is generally the preferred sequence due to the partial register stall setcc + movzbl suffers from. As a bonus, it also encodes one byte smaller. This fixes PR28146. The original commit tried inserting an 8bit-subreg into a GR32 (not GR32_ABCD) which was not appreciated by fast regalloc on 32-bit. llvm-svn: 274802 2016-07-08 00:50:23 +02:00			`for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)`
			`for (auto &Op : MI->implicit_operands())`
			`if ((Op.getReg() == X86::EFLAGS) && (Op.isDef()))`
			`return &*MI;`

			`return nullptr;`
			`}`

			`bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {`
			`for (auto &Op : MI->implicit_operands())`
			`if ((Op.getReg() == X86::EFLAGS) && (Op.isUse()))`
			`return true;`

			`return false;`
			`}`

			`bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {`
			`bool Changed = false;`
			`MRI = &MF.getRegInfo();`
			`TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();`

			`SmallVector<MachineInstr*, 4> ToErase;`

			`for (auto &MBB : MF) {`
			`for (auto &MI : MBB) {`
			`// Find a setcc that is used by a zext.`
			`// This doesn't have to be the only use, the transformation is safe`
			`// regardless.`
			`if (!isSetCCr(MI.getOpcode()))`
			`continue;`

			`MachineInstr *ZExt = nullptr;`
			`for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))`
			`if (Use.getOpcode() == X86::MOVZX32rr8)`
			`ZExt = &Use;`

			`if (!ZExt)`
			`continue;`

			`// Find the preceding instruction that imp-defs eflags.`
			`MachineInstr *FlagsDefMI = findFlagsImpDef(`
			`MI.getParent(), MachineBasicBlock::reverse_iterator(&MI));`
			`if (!FlagsDefMI)`
			`continue;`

			`// We'd like to put something that clobbers eflags directly before`
			`// FlagsDefMI. This can't hurt anything after FlagsDefMI, because`
			`// it, itself, by definition, clobbers eflags. But it may happen that`
			`// FlagsDefMI also *uses* eflags, in which case the transformation is`
			`// invalid.`
			`if (impUsesFlags(FlagsDefMI))`
			`continue;`

			`++NumSubstZexts;`
			`Changed = true;`

			`// On 32-bit, we need to be careful to force an ABCD register.`
			`const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()`
			`? &X86::GR32RegClass`
			`: &X86::GR32_ABCDRegClass;`
			`unsigned ZeroReg = MRI->createVirtualRegister(RC);`
			`unsigned InsertReg = MRI->createVirtualRegister(RC);`

			`// Initialize a register with 0. This must go before the eflags def`
			`BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),`
			`ZeroReg);`

			`// X86 setcc only takes an output GR8, so fake a GR32 input by inserting`
			`// the setcc result into the low byte of the zeroed register.`
			`BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),`
			`TII->get(X86::INSERT_SUBREG), InsertReg)`
			`.addReg(ZeroReg)`
			`.addReg(MI.getOperand(0).getReg())`
			`.addImm(X86::sub_8bit);`
			`MRI->replaceRegWith(ZExt->getOperand(0).getReg(), InsertReg);`
			`ToErase.push_back(ZExt);`
			`}`
			`}`

			`for (auto &I : ToErase)`
			`I->eraseFromParent();`

			`return Changed;`
			`}`