mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 13:11:39 +01:00
[X86MacroFusion] Handle branch fusion (AMD CPUs).
Summary: This adds a BranchFusion feature that replaces the use of MacroFusion for AMD CPUs. See D59688 for context.
Reviewers: andreadb, lebedev.ri
Subscribers: hiraditya, jdoerfert, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D59872
llvm-svn: 357171
This commit is contained in:
parent
6d24ff8d01
commit
6ea739f7f1
@ -344,6 +344,12 @@ def FeatureERMSB
|
||||
"ermsb", "HasERMSB", "true",
|
||||
"REP MOVS/STOS are fast">;
|
||||
|
||||
// Bulldozer and newer processors can merge CMP/TEST (but not other
|
||||
// instructions) with conditional branches.
|
||||
def FeatureBranchFusion
|
||||
: SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
|
||||
"CMP/TEST can be fused with conditional branches">;
|
||||
|
||||
// Sandy Bridge and newer processors have many instructions that can be
|
||||
// fused with conditional branches and pass through the CPU as a single
|
||||
// operation.
|
||||
@ -810,7 +816,7 @@ def ProcessorFeatures {
|
||||
FeatureSlowSHLD,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFast11ByteNOP,
|
||||
FeatureMacroFusion];
|
||||
FeatureBranchFusion];
|
||||
list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
|
||||
|
||||
// PileDriver
|
||||
@ -860,7 +866,7 @@ def ProcessorFeatures {
|
||||
FeatureLZCNT,
|
||||
FeatureFastBEXTR,
|
||||
FeatureFast15ByteNOP,
|
||||
FeatureMacroFusion,
|
||||
FeatureBranchFusion,
|
||||
FeatureMMX,
|
||||
FeatureMOVBE,
|
||||
FeatureMWAITX,
|
||||
|
@ -18,59 +18,29 @@
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
/// Check if the instr pair, FirstMI and SecondMI, should be fused
|
||||
/// together. Given SecondMI, when FirstMI is unspecified, then check if
|
||||
/// SecondMI may be part of a fused pair at all.
|
||||
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
const TargetSubtargetInfo &TSI,
|
||||
const MachineInstr *FirstMI,
|
||||
const MachineInstr &SecondMI) {
|
||||
const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI);
|
||||
// Check if this processor supports macro-fusion.
|
||||
if (!ST.hasMacroFusion())
|
||||
return false;
|
||||
namespace {
|
||||
|
||||
enum {
|
||||
FuseTest,
|
||||
FuseCmp,
|
||||
FuseInc
|
||||
} FuseKind;
|
||||
// The classification for the first instruction.
|
||||
enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid };
|
||||
|
||||
unsigned FirstOpcode = FirstMI
|
||||
? FirstMI->getOpcode()
|
||||
: static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
|
||||
unsigned SecondOpcode = SecondMI.getOpcode();
|
||||
// The classification for the second instruction (jump).
|
||||
enum class JumpKind {
|
||||
// JE, JL, JG and variants.
|
||||
ELG,
|
||||
// JA, JB and variants.
|
||||
AB,
|
||||
// JS, JP, JO and variants.
|
||||
SPO,
|
||||
// Not a fusable jump.
|
||||
Invalid,
|
||||
};
|
||||
|
||||
switch (SecondOpcode) {
|
||||
} // namespace
|
||||
|
||||
static FirstInstrKind classifyFirst(const MachineInstr &MI) {
|
||||
switch (MI.getOpcode()) {
|
||||
default:
|
||||
return false;
|
||||
case X86::JE_1:
|
||||
case X86::JNE_1:
|
||||
case X86::JL_1:
|
||||
case X86::JLE_1:
|
||||
case X86::JG_1:
|
||||
case X86::JGE_1:
|
||||
FuseKind = FuseInc;
|
||||
break;
|
||||
case X86::JB_1:
|
||||
case X86::JBE_1:
|
||||
case X86::JA_1:
|
||||
case X86::JAE_1:
|
||||
FuseKind = FuseCmp;
|
||||
break;
|
||||
case X86::JS_1:
|
||||
case X86::JNS_1:
|
||||
case X86::JP_1:
|
||||
case X86::JNP_1:
|
||||
case X86::JO_1:
|
||||
case X86::JNO_1:
|
||||
FuseKind = FuseTest;
|
||||
break;
|
||||
}
|
||||
|
||||
switch (FirstOpcode) {
|
||||
default:
|
||||
return false;
|
||||
return FirstInstrKind::Invalid;
|
||||
case X86::TEST8rr:
|
||||
case X86::TEST16rr:
|
||||
case X86::TEST32rr:
|
||||
@ -83,6 +53,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
case X86::TEST16mr:
|
||||
case X86::TEST32mr:
|
||||
case X86::TEST64mr:
|
||||
return FirstInstrKind::Test;
|
||||
case X86::AND16ri:
|
||||
case X86::AND16ri8:
|
||||
case X86::AND16rm:
|
||||
@ -98,7 +69,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
case X86::AND8ri:
|
||||
case X86::AND8rm:
|
||||
case X86::AND8rr:
|
||||
return true;
|
||||
return FirstInstrKind::And;
|
||||
case X86::CMP16ri:
|
||||
case X86::CMP16ri8:
|
||||
case X86::CMP16rm:
|
||||
@ -118,6 +89,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
case X86::CMP8rm:
|
||||
case X86::CMP8rr:
|
||||
case X86::CMP8mr:
|
||||
return FirstInstrKind::Cmp;
|
||||
case X86::ADD16ri:
|
||||
case X86::ADD16ri8:
|
||||
case X86::ADD16ri8_DB:
|
||||
@ -159,7 +131,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
case X86::SUB8ri:
|
||||
case X86::SUB8rm:
|
||||
case X86::SUB8rr:
|
||||
return FuseKind == FuseCmp || FuseKind == FuseInc;
|
||||
return FirstInstrKind::ALU;
|
||||
case X86::INC16r:
|
||||
case X86::INC32r:
|
||||
case X86::INC64r:
|
||||
@ -168,12 +140,85 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
case X86::DEC32r:
|
||||
case X86::DEC64r:
|
||||
case X86::DEC8r:
|
||||
return FuseKind == FuseInc;
|
||||
case X86::INSTRUCTION_LIST_END:
|
||||
return true;
|
||||
return FirstInstrKind::IncDec;
|
||||
}
|
||||
}
|
||||
|
||||
/// Sort the opcode of \p MI into the JumpKind group that the fusion rules
/// operate on. Any opcode that is not one of the conditional jumps listed
/// below is reported as JumpKind::Invalid.
static JumpKind classifySecond(const MachineInstr &MI) {
  const unsigned Opcode = MI.getOpcode();

  switch (Opcode) {
  // JE, JL, JG and variants.
  case X86::JE_1:
  case X86::JNE_1:
  case X86::JL_1:
  case X86::JLE_1:
  case X86::JG_1:
  case X86::JGE_1:
    return JumpKind::ELG;

  // JA, JB and variants.
  case X86::JB_1:
  case X86::JBE_1:
  case X86::JA_1:
  case X86::JAE_1:
    return JumpKind::AB;

  // JS, JP, JO and variants.
  case X86::JS_1:
  case X86::JNS_1:
  case X86::JP_1:
  case X86::JNP_1:
  case X86::JO_1:
  case X86::JNO_1:
    return JumpKind::SPO;

  default:
    break;
  }

  // Everything else is not a fusable jump.
  return JumpKind::Invalid;
}
|
||||
|
||||
/// Check if the instr pair, FirstMI and SecondMI, should be fused
|
||||
/// together. Given SecondMI, when FirstMI is unspecified, then check if
|
||||
/// SecondMI may be part of a fused pair at all.
|
||||
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
const TargetSubtargetInfo &TSI,
|
||||
const MachineInstr *FirstMI,
|
||||
const MachineInstr &SecondMI) {
|
||||
const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
|
||||
|
||||
// Check if this processor supports any kind of fusion.
|
||||
if (!(ST.hasBranchFusion() || ST.hasMacroFusion()))
|
||||
return false;
|
||||
|
||||
const JumpKind BranchKind = classifySecond(SecondMI);
|
||||
|
||||
if (BranchKind == JumpKind::Invalid)
|
||||
return false; // Second cannot be fused with anything.
|
||||
|
||||
if (FirstMI == nullptr)
|
||||
return true; // We're only checking whether Second can be fused at all.
|
||||
|
||||
const FirstInstrKind TestKind = classifyFirst(*FirstMI);
|
||||
|
||||
if (ST.hasBranchFusion()) {
|
||||
// Branch fusion can merge CMP and TEST with all conditional jumps.
|
||||
return (TestKind == FirstInstrKind::Cmp ||
|
||||
TestKind == FirstInstrKind::Test);
|
||||
}
|
||||
|
||||
if (ST.hasMacroFusion()) {
|
||||
// Macro Fusion rules are a bit more complex. See Agner Fog's
|
||||
// Microarchitecture table 9.2 "Instruction Fusion".
|
||||
switch (TestKind) {
|
||||
case FirstInstrKind::Test:
|
||||
case FirstInstrKind::And:
|
||||
return true;
|
||||
case FirstInstrKind::Cmp:
|
||||
case FirstInstrKind::ALU:
|
||||
return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB;
|
||||
case FirstInstrKind::IncDec:
|
||||
return BranchKind == JumpKind::ELG;
|
||||
case FirstInstrKind::Invalid:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
llvm_unreachable("unknown branch fusion type");
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
|
||||
std::unique_ptr<ScheduleDAGMutation>
|
||||
|
@ -297,6 +297,9 @@ protected:
|
||||
/// True if the processor supports macrofusion.
|
||||
bool HasMacroFusion = false;
|
||||
|
||||
/// True if the processor supports branch fusion.
|
||||
bool HasBranchFusion = false;
|
||||
|
||||
/// True if the processor has enhanced REP MOVSB/STOSB.
|
||||
bool HasERMSB = false;
|
||||
|
||||
@ -642,6 +645,7 @@ public:
|
||||
bool hasFastBEXTR() const { return HasFastBEXTR; }
|
||||
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
|
||||
bool hasMacroFusion() const { return HasMacroFusion; }
|
||||
bool hasBranchFusion() const { return HasBranchFusion; }
|
||||
bool hasERMSB() const { return HasERMSB; }
|
||||
bool hasSlowDivide32() const { return HasSlowDivide32; }
|
||||
bool hasSlowDivide64() const { return HasSlowDivide64; }
|
||||
|
@ -2984,7 +2984,7 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
|
||||
}
|
||||
|
||||
bool X86TTIImpl::canMacroFuseCmp() {
|
||||
return ST->hasMacroFusion();
|
||||
return ST->hasMacroFusion() || ST->hasBranchFusion();
|
||||
}
|
||||
|
||||
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
|
||||
|
@ -59,6 +59,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
|
||||
X86::FeatureLEAForSP,
|
||||
X86::FeatureLEAUsesAG,
|
||||
X86::FeatureLZCNTFalseDeps,
|
||||
X86::FeatureBranchFusion,
|
||||
X86::FeatureMacroFusion,
|
||||
X86::FeatureMergeToThreeWayBranch,
|
||||
X86::FeaturePadShortFunctions,
|
||||
|
@ -1,6 +1,7 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion | FileCheck %s --check-prefix=NOFUSION
|
||||
; RUN: llc < %s -mtriple=x86_64-- -mattr=+macrofusion | FileCheck %s --check-prefix=MACROFUSION
|
||||
; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion,-branchfusion | FileCheck %s --check-prefix=NOFUSION
|
||||
; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion,+branchfusion | FileCheck %s --check-prefix=BRANCHFUSION --check-prefix=BRANCHFUSIONONLY
|
||||
; RUN: llc < %s -mtriple=x86_64-- -mattr=+macrofusion,-branchfusion | FileCheck %s --check-prefix=BRANCHFUSION --check-prefix=MACROFUSION
|
||||
|
||||
; The test instruction should be scheduled right before je to enable fusion.
|
||||
|
||||
@ -16,16 +17,16 @@ define i32 @macrofuse_test_je(i32 %flags, i8* %p) nounwind {
|
||||
; NOFUSION-NEXT: .LBB0_2: # %if.end
|
||||
; NOFUSION-NEXT: retq
|
||||
;
|
||||
; MACROFUSION-LABEL: macrofuse_test_je:
|
||||
; MACROFUSION: # %bb.0: # %entry
|
||||
; MACROFUSION-NEXT: xorl %eax, %eax
|
||||
; MACROFUSION-NEXT: movb $1, (%rsi)
|
||||
; MACROFUSION-NEXT: testl $512, %edi # imm = 0x200
|
||||
; MACROFUSION-NEXT: je .LBB0_2
|
||||
; MACROFUSION-NEXT: # %bb.1: # %if.then
|
||||
; MACROFUSION-NEXT: movl $1, %eax
|
||||
; MACROFUSION-NEXT: .LBB0_2: # %if.end
|
||||
; MACROFUSION-NEXT: retq
|
||||
; BRANCHFUSION-LABEL: macrofuse_test_je:
|
||||
; BRANCHFUSION: # %bb.0: # %entry
|
||||
; BRANCHFUSION-NEXT: xorl %eax, %eax
|
||||
; BRANCHFUSION-NEXT: movb $1, (%rsi)
|
||||
; BRANCHFUSION-NEXT: testl $512, %edi # imm = 0x200
|
||||
; BRANCHFUSION-NEXT: je .LBB0_2
|
||||
; BRANCHFUSION-NEXT: # %bb.1: # %if.then
|
||||
; BRANCHFUSION-NEXT: movl $1, %eax
|
||||
; BRANCHFUSION-NEXT: .LBB0_2: # %if.end
|
||||
; BRANCHFUSION-NEXT: retq
|
||||
entry:
|
||||
%and = and i32 %flags, 512
|
||||
%tobool = icmp eq i32 %and, 0
|
||||
@ -53,17 +54,17 @@ define i32 @macrofuse_cmp_je(i32 %flags, i8* %p) nounwind {
|
||||
; NOFUSION-NEXT: xorl %eax, %eax
|
||||
; NOFUSION-NEXT: retq
|
||||
;
|
||||
; MACROFUSION-LABEL: macrofuse_cmp_je:
|
||||
; MACROFUSION: # %bb.0: # %entry
|
||||
; MACROFUSION-NEXT: movb $1, (%rsi)
|
||||
; MACROFUSION-NEXT: cmpl $512, %edi # imm = 0x200
|
||||
; MACROFUSION-NEXT: je .LBB1_1
|
||||
; MACROFUSION-NEXT: # %bb.2: # %if.then
|
||||
; MACROFUSION-NEXT: movl $1, %eax
|
||||
; MACROFUSION-NEXT: retq
|
||||
; MACROFUSION-NEXT: .LBB1_1:
|
||||
; MACROFUSION-NEXT: xorl %eax, %eax
|
||||
; MACROFUSION-NEXT: retq
|
||||
; BRANCHFUSION-LABEL: macrofuse_cmp_je:
|
||||
; BRANCHFUSION: # %bb.0: # %entry
|
||||
; BRANCHFUSION-NEXT: movb $1, (%rsi)
|
||||
; BRANCHFUSION-NEXT: cmpl $512, %edi # imm = 0x200
|
||||
; BRANCHFUSION-NEXT: je .LBB1_1
|
||||
; BRANCHFUSION-NEXT: # %bb.2: # %if.then
|
||||
; BRANCHFUSION-NEXT: movl $1, %eax
|
||||
; BRANCHFUSION-NEXT: retq
|
||||
; BRANCHFUSION-NEXT: .LBB1_1:
|
||||
; BRANCHFUSION-NEXT: xorl %eax, %eax
|
||||
; BRANCHFUSION-NEXT: retq
|
||||
entry:
|
||||
%sub = sub i32 %flags, 512
|
||||
%tobool = icmp eq i32 %sub, 0
|
||||
@ -90,6 +91,17 @@ define i32 @macrofuse_alu_je(i32 %flags, i8* %p) nounwind {
|
||||
; NOFUSION-NEXT: .LBB2_2: # %if.end
|
||||
; NOFUSION-NEXT: retq
|
||||
;
|
||||
; BRANCHFUSIONONLY-LABEL: macrofuse_alu_je:
|
||||
; BRANCHFUSIONONLY: # %bb.0: # %entry
|
||||
; BRANCHFUSIONONLY-NEXT: movl %edi, %eax
|
||||
; BRANCHFUSIONONLY-NEXT: addl $-512, %eax # imm = 0xFE00
|
||||
; BRANCHFUSIONONLY-NEXT: movb $1, (%rsi)
|
||||
; BRANCHFUSIONONLY-NEXT: je .LBB2_2
|
||||
; BRANCHFUSIONONLY-NEXT: # %bb.1: # %if.then
|
||||
; BRANCHFUSIONONLY-NEXT: movl $1, %eax
|
||||
; BRANCHFUSIONONLY-NEXT: .LBB2_2: # %if.end
|
||||
; BRANCHFUSIONONLY-NEXT: retq
|
||||
;
|
||||
; MACROFUSION-LABEL: macrofuse_alu_je:
|
||||
; MACROFUSION: # %bb.0: # %entry
|
||||
; MACROFUSION-NEXT: movl %edi, %eax
|
||||
@ -126,6 +138,17 @@ define i32 @macrofuse_dec_je(i32 %flags, i8* %p) nounwind {
|
||||
; NOFUSION-NEXT: .LBB3_2: # %if.end
|
||||
; NOFUSION-NEXT: retq
|
||||
;
|
||||
; BRANCHFUSIONONLY-LABEL: macrofuse_dec_je:
|
||||
; BRANCHFUSIONONLY: # %bb.0: # %entry
|
||||
; BRANCHFUSIONONLY-NEXT: movl %edi, %eax
|
||||
; BRANCHFUSIONONLY-NEXT: decl %eax
|
||||
; BRANCHFUSIONONLY-NEXT: movb $1, (%rsi)
|
||||
; BRANCHFUSIONONLY-NEXT: je .LBB3_2
|
||||
; BRANCHFUSIONONLY-NEXT: # %bb.1: # %if.then
|
||||
; BRANCHFUSIONONLY-NEXT: movl $1, %eax
|
||||
; BRANCHFUSIONONLY-NEXT: .LBB3_2: # %if.end
|
||||
; BRANCHFUSIONONLY-NEXT: retq
|
||||
;
|
||||
; MACROFUSION-LABEL: macrofuse_dec_je:
|
||||
; MACROFUSION: # %bb.0: # %entry
|
||||
; MACROFUSION-NEXT: movl %edi, %eax
|
||||
|
@ -3,8 +3,9 @@
|
||||
; RUN: opt < %s -loop-reduce -mcpu=bdver2 -S | FileCheck %s --check-prefix=BUL
|
||||
; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW
|
||||
|
||||
; RUN: llc < %s | FileCheck %s --check-prefix=BASE
|
||||
; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE
|
||||
; RUN: llc < %s | FileCheck %s --check-prefix=BASE
|
||||
; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE
|
||||
; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-unknown"
|
||||
|
Loading…
x
Reference in New Issue
Block a user