mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
AMDGPU/GlobalISel: Introduce post-legalize combiner
The current set of custom combines is only really useful after legalization, so move them there. There is a lot of overlap in the boilerplate here, but I think we do want a pretty different set of combines before and after legalization. I think we will want a lot of overlap between the post-legalize and a post-regbankselect combiner.
This commit is contained in:
parent
eba472875b
commit
b754a8cb27
@ -30,6 +30,8 @@ class Module;
|
|||||||
// GlobalISel passes
|
// GlobalISel passes
|
||||||
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &);
|
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &);
|
||||||
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone);
|
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone);
|
||||||
|
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &);
|
||||||
|
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
|
||||||
|
|
||||||
// R600 Passes
|
// R600 Passes
|
||||||
FunctionPass *createR600VectorRegMerger();
|
FunctionPass *createR600VectorRegMerger();
|
||||||
|
@ -26,7 +26,12 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
|
|||||||
|
|
||||||
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
|
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
|
||||||
"AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
|
"AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
|
||||||
elide_br_by_inverting_cond,
|
elide_br_by_inverting_cond]> {
|
||||||
gfx6gfx7_combines]> {
|
|
||||||
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
|
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
|
||||||
|
"AMDGPUGenPostLegalizerCombinerHelper", [all_combines,
|
||||||
|
gfx6gfx7_combines]> {
|
||||||
|
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
|
||||||
|
}
|
||||||
|
261
lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
Normal file
261
lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
Normal file
@ -0,0 +1,261 @@
|
|||||||
|
//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp -------------------===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// This pass does combining of machine instructions at the generic MI level,
|
||||||
|
// after the legalizer.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "AMDGPUTargetMachine.h"
|
||||||
|
#include "AMDGPULegalizerInfo.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/Combiner.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
|
||||||
|
#include "llvm/CodeGen/MachineDominators.h"
|
||||||
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||||
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||||
|
#include "llvm/Support/Debug.h"
|
||||||
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
using namespace MIPatternMatch;
|
||||||
|
|
||||||
|
struct FMinFMaxLegacyInfo {
|
||||||
|
Register LHS;
|
||||||
|
Register RHS;
|
||||||
|
Register True;
|
||||||
|
Register False;
|
||||||
|
CmpInst::Predicate Pred;
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
|
||||||
|
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
|
||||||
|
MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
|
||||||
|
// FIXME: Combines should have subtarget predicates, and we shouldn't need
|
||||||
|
// this here.
|
||||||
|
if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// FIXME: Type predicate on pattern
|
||||||
|
if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Register Cond = MI.getOperand(1).getReg();
|
||||||
|
if (!MRI.hasOneNonDBGUse(Cond) ||
|
||||||
|
!mi_match(Cond, MRI,
|
||||||
|
m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Info.True = MI.getOperand(2).getReg();
|
||||||
|
Info.False = MI.getOperand(3).getReg();
|
||||||
|
|
||||||
|
if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
|
||||||
|
!(Info.LHS == Info.False && Info.RHS == Info.True))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
switch (Info.Pred) {
|
||||||
|
case CmpInst::FCMP_FALSE:
|
||||||
|
case CmpInst::FCMP_OEQ:
|
||||||
|
case CmpInst::FCMP_ONE:
|
||||||
|
case CmpInst::FCMP_ORD:
|
||||||
|
case CmpInst::FCMP_UNO:
|
||||||
|
case CmpInst::FCMP_UEQ:
|
||||||
|
case CmpInst::FCMP_UNE:
|
||||||
|
case CmpInst::FCMP_TRUE:
|
||||||
|
return false;
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
|
||||||
|
const FMinFMaxLegacyInfo &Info) {
|
||||||
|
|
||||||
|
auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
|
||||||
|
MachineIRBuilder MIB(MI);
|
||||||
|
MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
|
||||||
|
};
|
||||||
|
|
||||||
|
switch (Info.Pred) {
|
||||||
|
case CmpInst::FCMP_ULT:
|
||||||
|
case CmpInst::FCMP_ULE:
|
||||||
|
if (Info.LHS == Info.True)
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
|
||||||
|
else
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
|
||||||
|
break;
|
||||||
|
case CmpInst::FCMP_OLE:
|
||||||
|
case CmpInst::FCMP_OLT: {
|
||||||
|
// We need to permute the operands to get the correct NaN behavior. The
|
||||||
|
// selected operand is the second one based on the failing compare with NaN,
|
||||||
|
// so permute it based on the compare type the hardware uses.
|
||||||
|
if (Info.LHS == Info.True)
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
|
||||||
|
else
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case CmpInst::FCMP_UGE:
|
||||||
|
case CmpInst::FCMP_UGT: {
|
||||||
|
if (Info.LHS == Info.True)
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
|
||||||
|
else
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case CmpInst::FCMP_OGT:
|
||||||
|
case CmpInst::FCMP_OGE: {
|
||||||
|
if (Info.LHS == Info.True)
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
|
||||||
|
else
|
||||||
|
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
llvm_unreachable("predicate should not have matched");
|
||||||
|
}
|
||||||
|
|
||||||
|
MI.eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
|
#include "AMDGPUGenPostLegalizeGICombiner.inc"
|
||||||
|
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
|
||||||
|
#include "AMDGPUGenPostLegalizeGICombiner.inc"
|
||||||
|
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
|
||||||
|
|
||||||
|
class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
|
||||||
|
GISelKnownBits *KB;
|
||||||
|
MachineDominatorTree *MDT;
|
||||||
|
|
||||||
|
public:
|
||||||
|
AMDGPUGenPostLegalizerCombinerHelper Generated;
|
||||||
|
|
||||||
|
AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
|
||||||
|
const AMDGPULegalizerInfo *LI,
|
||||||
|
GISelKnownBits *KB, MachineDominatorTree *MDT)
|
||||||
|
: CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
|
||||||
|
/*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
|
||||||
|
KB(KB), MDT(MDT) {
|
||||||
|
if (!Generated.parseCommandLineOption())
|
||||||
|
report_fatal_error("Invalid rule identifier");
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
|
||||||
|
MachineIRBuilder &B) const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
|
||||||
|
MachineInstr &MI,
|
||||||
|
MachineIRBuilder &B) const {
|
||||||
|
CombinerHelper Helper(Observer, B, KB, MDT);
|
||||||
|
|
||||||
|
if (Generated.tryCombineAll(Observer, MI, B, Helper))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
switch (MI.getOpcode()) {
|
||||||
|
case TargetOpcode::G_SHL:
|
||||||
|
case TargetOpcode::G_LSHR:
|
||||||
|
case TargetOpcode::G_ASHR:
|
||||||
|
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
|
||||||
|
// common case, splitting this into a move and a 32-bit shift is faster and
|
||||||
|
// the same code size.
|
||||||
|
return Helper.tryCombineShiftToUnmerge(MI, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
|
||||||
|
#include "AMDGPUGenPostLegalizeGICombiner.inc"
|
||||||
|
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
|
||||||
|
|
||||||
|
// Pass boilerplate
|
||||||
|
// ================
|
||||||
|
|
||||||
|
class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
|
||||||
|
AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
|
||||||
|
|
||||||
|
StringRef getPassName() const override {
|
||||||
|
return "AMDGPUPostLegalizerCombiner";
|
||||||
|
}
|
||||||
|
|
||||||
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||||
|
|
||||||
|
void getAnalysisUsage(AnalysisUsage &AU) const override;
|
||||||
|
private:
|
||||||
|
bool IsOptNone;
|
||||||
|
};
|
||||||
|
} // end anonymous namespace
|
||||||
|
|
||||||
|
void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
|
||||||
|
AU.addRequired<TargetPassConfig>();
|
||||||
|
AU.setPreservesCFG();
|
||||||
|
getSelectionDAGFallbackAnalysisUsage(AU);
|
||||||
|
AU.addRequired<GISelKnownBitsAnalysis>();
|
||||||
|
AU.addPreserved<GISelKnownBitsAnalysis>();
|
||||||
|
if (!IsOptNone) {
|
||||||
|
AU.addRequired<MachineDominatorTree>();
|
||||||
|
AU.addPreserved<MachineDominatorTree>();
|
||||||
|
}
|
||||||
|
MachineFunctionPass::getAnalysisUsage(AU);
|
||||||
|
}
|
||||||
|
|
||||||
|
AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
|
||||||
|
: MachineFunctionPass(ID), IsOptNone(IsOptNone) {
|
||||||
|
initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
|
||||||
|
if (MF.getProperties().hasProperty(
|
||||||
|
MachineFunctionProperties::Property::FailedISel))
|
||||||
|
return false;
|
||||||
|
auto *TPC = &getAnalysis<TargetPassConfig>();
|
||||||
|
const Function &F = MF.getFunction();
|
||||||
|
bool EnableOpt =
|
||||||
|
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
|
||||||
|
|
||||||
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||||
|
const AMDGPULegalizerInfo *LI
|
||||||
|
= static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
|
||||||
|
|
||||||
|
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
|
||||||
|
MachineDominatorTree *MDT =
|
||||||
|
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
|
||||||
|
AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
|
||||||
|
F.hasMinSize(), LI, KB, MDT);
|
||||||
|
Combiner C(PCInfo, TPC);
|
||||||
|
return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
char AMDGPUPostLegalizerCombiner::ID = 0;
|
||||||
|
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
|
||||||
|
"Combine AMDGPU machine instrs after legalization",
|
||||||
|
false, false)
|
||||||
|
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
|
||||||
|
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
|
||||||
|
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
|
||||||
|
"Combine AMDGPU machine instrs after legalization", false,
|
||||||
|
false)
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
|
||||||
|
return new AMDGPUPostLegalizerCombiner(IsOptNone);
|
||||||
|
}
|
||||||
|
} // end namespace llvm
|
@ -28,112 +28,13 @@
|
|||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
using namespace MIPatternMatch;
|
using namespace MIPatternMatch;
|
||||||
|
|
||||||
struct FMinFMaxLegacyInfo {
|
|
||||||
Register LHS;
|
|
||||||
Register RHS;
|
|
||||||
Register True;
|
|
||||||
Register False;
|
|
||||||
CmpInst::Predicate Pred;
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
|
|
||||||
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
||||||
MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
|
|
||||||
// FIXME: Combines should have subtarget predicates, and we shouldn't need
|
|
||||||
// this here.
|
|
||||||
if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
// FIXME: Type predicate on pattern
|
|
||||||
if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
Register Cond = MI.getOperand(1).getReg();
|
|
||||||
if (!MRI.hasOneNonDBGUse(Cond) ||
|
|
||||||
!mi_match(Cond, MRI,
|
|
||||||
m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
Info.True = MI.getOperand(2).getReg();
|
|
||||||
Info.False = MI.getOperand(3).getReg();
|
|
||||||
|
|
||||||
if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
|
|
||||||
!(Info.LHS == Info.False && Info.RHS == Info.True))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
switch (Info.Pred) {
|
|
||||||
case CmpInst::FCMP_FALSE:
|
|
||||||
case CmpInst::FCMP_OEQ:
|
|
||||||
case CmpInst::FCMP_ONE:
|
|
||||||
case CmpInst::FCMP_ORD:
|
|
||||||
case CmpInst::FCMP_UNO:
|
|
||||||
case CmpInst::FCMP_UEQ:
|
|
||||||
case CmpInst::FCMP_UNE:
|
|
||||||
case CmpInst::FCMP_TRUE:
|
|
||||||
return false;
|
|
||||||
default:
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
|
|
||||||
const FMinFMaxLegacyInfo &Info) {
|
|
||||||
|
|
||||||
auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
|
|
||||||
MachineIRBuilder MIB(MI);
|
|
||||||
MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
|
|
||||||
};
|
|
||||||
|
|
||||||
switch (Info.Pred) {
|
|
||||||
case CmpInst::FCMP_ULT:
|
|
||||||
case CmpInst::FCMP_ULE:
|
|
||||||
if (Info.LHS == Info.True)
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
|
|
||||||
else
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
|
|
||||||
break;
|
|
||||||
case CmpInst::FCMP_OLE:
|
|
||||||
case CmpInst::FCMP_OLT: {
|
|
||||||
// We need to permute the operands to get the correct NaN behavior. The
|
|
||||||
// selected operand is the second one based on the failing compare with NaN,
|
|
||||||
// so permute it based on the compare type the hardware uses.
|
|
||||||
if (Info.LHS == Info.True)
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
|
|
||||||
else
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case CmpInst::FCMP_UGE:
|
|
||||||
case CmpInst::FCMP_UGT: {
|
|
||||||
if (Info.LHS == Info.True)
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
|
|
||||||
else
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case CmpInst::FCMP_OGT:
|
|
||||||
case CmpInst::FCMP_OGE: {
|
|
||||||
if (Info.LHS == Info.True)
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
|
|
||||||
else
|
|
||||||
buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
llvm_unreachable("predicate should not have matched");
|
|
||||||
}
|
|
||||||
|
|
||||||
MI.eraseFromParent();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
#include "AMDGPUGenGICombiner.inc"
|
#include "AMDGPUGenPreLegalizeGICombiner.inc"
|
||||||
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
|
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
|
||||||
#include "AMDGPUGenGICombiner.inc"
|
#include "AMDGPUGenPreLegalizeGICombiner.inc"
|
||||||
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
|
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
|
||||||
|
|
||||||
class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo {
|
class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo {
|
||||||
@ -165,13 +66,6 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
|
|||||||
return true;
|
return true;
|
||||||
|
|
||||||
switch (MI.getOpcode()) {
|
switch (MI.getOpcode()) {
|
||||||
case TargetOpcode::G_SHL:
|
|
||||||
case TargetOpcode::G_LSHR:
|
|
||||||
case TargetOpcode::G_ASHR:
|
|
||||||
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
|
|
||||||
// common case, splitting this into a move and a 32-bit shift is faster and
|
|
||||||
// the same code size.
|
|
||||||
return Helper.tryCombineShiftToUnmerge(MI, 32);
|
|
||||||
case TargetOpcode::G_CONCAT_VECTORS:
|
case TargetOpcode::G_CONCAT_VECTORS:
|
||||||
return Helper.tryCombineConcatVectors(MI);
|
return Helper.tryCombineConcatVectors(MI);
|
||||||
case TargetOpcode::G_SHUFFLE_VECTOR:
|
case TargetOpcode::G_SHUFFLE_VECTOR:
|
||||||
@ -182,7 +76,7 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
|
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
|
||||||
#include "AMDGPUGenGICombiner.inc"
|
#include "AMDGPUGenPreLegalizeGICombiner.inc"
|
||||||
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
|
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
|
||||||
|
|
||||||
// Pass boilerplate
|
// Pass boilerplate
|
||||||
@ -194,7 +88,9 @@ public:
|
|||||||
|
|
||||||
AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
|
AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
|
||||||
|
|
||||||
StringRef getPassName() const override { return "AMDGPUPreLegalizerCombiner"; }
|
StringRef getPassName() const override {
|
||||||
|
return "AMDGPUPreLegalizerCombiner";
|
||||||
|
}
|
||||||
|
|
||||||
bool runOnMachineFunction(MachineFunction &MF) override;
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||||
|
|
||||||
|
@ -218,6 +218,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
|
|||||||
initializeAMDGPULowerKernelAttributesPass(*PR);
|
initializeAMDGPULowerKernelAttributesPass(*PR);
|
||||||
initializeAMDGPULowerIntrinsicsPass(*PR);
|
initializeAMDGPULowerIntrinsicsPass(*PR);
|
||||||
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
|
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
|
||||||
|
initializeAMDGPUPostLegalizerCombinerPass(*PR);
|
||||||
initializeAMDGPUPreLegalizerCombinerPass(*PR);
|
initializeAMDGPUPreLegalizerCombinerPass(*PR);
|
||||||
initializeAMDGPUPromoteAllocaPass(*PR);
|
initializeAMDGPUPromoteAllocaPass(*PR);
|
||||||
initializeAMDGPUCodeGenPreparePass(*PR);
|
initializeAMDGPUCodeGenPreparePass(*PR);
|
||||||
@ -623,6 +624,7 @@ public:
|
|||||||
bool addIRTranslator() override;
|
bool addIRTranslator() override;
|
||||||
void addPreLegalizeMachineIR() override;
|
void addPreLegalizeMachineIR() override;
|
||||||
bool addLegalizeMachineIR() override;
|
bool addLegalizeMachineIR() override;
|
||||||
|
void addPreRegBankSelect() override;
|
||||||
bool addRegBankSelect() override;
|
bool addRegBankSelect() override;
|
||||||
bool addGlobalInstructionSelect() override;
|
bool addGlobalInstructionSelect() override;
|
||||||
void addFastRegAlloc() override;
|
void addFastRegAlloc() override;
|
||||||
@ -911,6 +913,11 @@ bool GCNPassConfig::addLegalizeMachineIR() {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GCNPassConfig::addPreRegBankSelect() {
|
||||||
|
bool IsOptNone = getOptLevel() == CodeGenOpt::None;
|
||||||
|
addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
|
||||||
|
}
|
||||||
|
|
||||||
bool GCNPassConfig::addRegBankSelect() {
|
bool GCNPassConfig::addRegBankSelect() {
|
||||||
addPass(new RegBankSelect());
|
addPass(new RegBankSelect());
|
||||||
return false;
|
return false;
|
||||||
|
@ -15,8 +15,10 @@ tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
|
|||||||
|
|
||||||
set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td)
|
set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td)
|
||||||
tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel)
|
tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel)
|
||||||
tablegen(LLVM AMDGPUGenGICombiner.inc -gen-global-isel-combiner
|
tablegen(LLVM AMDGPUGenPreLegalizeGICombiner.inc -gen-global-isel-combiner
|
||||||
-combiners="AMDGPUPreLegalizerCombinerHelper")
|
-combiners="AMDGPUPreLegalizerCombinerHelper")
|
||||||
|
tablegen(LLVM AMDGPUGenPostLegalizeGICombiner.inc -gen-global-isel-combiner
|
||||||
|
-combiners="AMDGPUPostLegalizerCombinerHelper")
|
||||||
|
|
||||||
set(LLVM_TARGET_DEFINITIONS R600.td)
|
set(LLVM_TARGET_DEFINITIONS R600.td)
|
||||||
tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer)
|
tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer)
|
||||||
@ -60,6 +62,7 @@ add_llvm_target(AMDGPUCodeGen
|
|||||||
AMDGPUMacroFusion.cpp
|
AMDGPUMacroFusion.cpp
|
||||||
AMDGPUMCInstLower.cpp
|
AMDGPUMCInstLower.cpp
|
||||||
AMDGPUOpenCLEnqueuedBlockLowering.cpp
|
AMDGPUOpenCLEnqueuedBlockLowering.cpp
|
||||||
|
AMDGPUPostLegalizerCombiner.cpp
|
||||||
AMDGPUPreLegalizerCombiner.cpp
|
AMDGPUPreLegalizerCombiner.cpp
|
||||||
AMDGPUPromoteAlloca.cpp
|
AMDGPUPromoteAlloca.cpp
|
||||||
AMDGPUPropagateAttributes.cpp
|
AMDGPUPropagateAttributes.cpp
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||||
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
|
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
|
||||||
|
|
||||||
---
|
---
|
||||||
name: narrow_ashr_s64_32_s64amt
|
name: narrow_ashr_s64_32_s64amt
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||||
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
|
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
|
||||||
|
|
||||||
---
|
---
|
||||||
name: narrow_lshr_s64_32_s64amt
|
name: narrow_lshr_s64_32_s64amt
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||||
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
|
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -253,3 +253,24 @@ define double @v_test_fmax_legacy_ult_f64(double %a, double %b) {
|
|||||||
%val = select i1 %cmp, double %b, double %a
|
%val = select i1 %cmp, double %b, double %a
|
||||||
ret double %val
|
ret double %val
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <2 x float> @v_test_fmax_legacy_ogt_v2f32(<2 x float> %a, <2 x float> %b) {
|
||||||
|
; GFX6-LABEL: v_test_fmax_legacy_ogt_v2f32:
|
||||||
|
; GFX6: ; %bb.0:
|
||||||
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; GFX6-NEXT: v_max_legacy_f32_e32 v0, v0, v2
|
||||||
|
; GFX6-NEXT: v_max_legacy_f32_e32 v1, v1, v3
|
||||||
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
;
|
||||||
|
; GFX8-LABEL: v_test_fmax_legacy_ogt_v2f32:
|
||||||
|
; GFX8: ; %bb.0:
|
||||||
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
|
||||||
|
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
||||||
|
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
|
||||||
|
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
||||||
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%cmp = fcmp ogt <2 x float> %a, %b
|
||||||
|
%val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
|
||||||
|
ret <2 x float> %val
|
||||||
|
}
|
||||||
|
@ -382,3 +382,24 @@ define float @v_test_fcmp_select_false(float %a, float %b) {
|
|||||||
%val = select i1 %cmp, float %a, float %b
|
%val = select i1 %cmp, float %a, float %b
|
||||||
ret float %val
|
ret float %val
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <2 x float> @v_test_fmin_legacy_ole_v2f32(<2 x float> %a, <2 x float> %b) {
|
||||||
|
; GFX6-LABEL: v_test_fmin_legacy_ole_v2f32:
|
||||||
|
; GFX6: ; %bb.0:
|
||||||
|
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; GFX6-NEXT: v_min_legacy_f32_e32 v0, v0, v2
|
||||||
|
; GFX6-NEXT: v_min_legacy_f32_e32 v1, v1, v3
|
||||||
|
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
;
|
||||||
|
; GFX8-LABEL: v_test_fmin_legacy_ole_v2f32:
|
||||||
|
; GFX8: ; %bb.0:
|
||||||
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v2
|
||||||
|
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
||||||
|
; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v1, v3
|
||||||
|
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
||||||
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%cmp = fcmp ole <2 x float> %a, %b
|
||||||
|
%val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
|
||||||
|
ret <2 x float> %val
|
||||||
|
}
|
||||||
|
@ -26,9 +26,7 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
|
|||||||
|
|
||||||
; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
|
; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
|
||||||
|
|
||||||
; OS-UNKNOWN: s_add_u32 s[[LO:[0-9]+]], s0, 44
|
; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x15
|
||||||
; OS-UNKNOWN-NEXT: s_addc_u32 s[[HI:[0-9]+]], s1, 0
|
|
||||||
; OS-UNKNOWN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO]]:[[HI]]{{\]}}, 0xa
|
|
||||||
define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
|
define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
|
||||||
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
|
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
|
||||||
%header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
|
%header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
|
||||||
|
@ -286,30 +286,31 @@ define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
|
|||||||
; CHECK-LABEL: v_udiv_i32_pow2k_denom:
|
; CHECK-LABEL: v_udiv_i32_pow2k_denom:
|
||||||
; CHECK: ; %bb.0:
|
; CHECK: ; %bb.0:
|
||||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; CHECK-NEXT: s_movk_i32 s6, 0x1000
|
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s6
|
; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000
|
||||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4
|
||||||
; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
|
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
|
||||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
|
; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6
|
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v3, v1, s6
|
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
|
; CHECK-NEXT: v_mul_hi_u32 v4, v2, s4
|
||||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
|
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
|
||||||
|
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
|
||||||
|
; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
|
||||||
|
; CHECK-NEXT: v_mul_hi_u32 v3, v3, v2
|
||||||
|
; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v2, v3
|
||||||
|
; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3
|
||||||
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
|
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v2, v2, v1
|
; CHECK-NEXT: v_mul_hi_u32 v2, v2, v0
|
||||||
; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2
|
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
|
||||||
; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2
|
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2
|
||||||
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, 1, v2
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v1, v1, v0
|
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v3
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6
|
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
|
||||||
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1
|
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
|
||||||
; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, 1, v1
|
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
|
|
||||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
|
|
||||||
; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v5
|
|
||||||
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
|
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
|
||||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5]
|
; CHECK-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5]
|
||||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
||||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
%result = udiv i32 %num, 4096
|
%result = udiv i32 %num, 4096
|
||||||
ret i32 %result
|
ret i32 %result
|
||||||
@ -319,9 +320,9 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
|
|||||||
; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
|
; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
|
||||||
; CHECK: ; %bb.0:
|
; CHECK: ; %bb.0:
|
||||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; CHECK-NEXT: s_movk_i32 s8, 0x1000
|
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000
|
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000
|
||||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s8
|
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4
|
||||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2
|
; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2
|
||||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
|
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
|
||||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
|
; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
|
||||||
@ -329,9 +330,9 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
|
|||||||
; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
|
; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
|
||||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
|
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
|
||||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
|
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8
|
; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v6, v3, s8
|
; CHECK-NEXT: v_mul_hi_u32 v6, v3, s4
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2
|
; CHECK-NEXT: v_lshlrev_b32_e32 v7, 12, v4
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2
|
; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
|
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
|
; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
|
||||||
@ -349,17 +350,17 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
|
|||||||
; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
|
; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0
|
; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1
|
; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8
|
; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3
|
||||||
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3
|
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3
|
||||||
; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 1, v3
|
; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 1, v3
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v8, v4, v2
|
; CHECK-NEXT: v_lshlrev_b32_e32 v8, 12, v4
|
||||||
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v4
|
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v4
|
||||||
; CHECK-NEXT: v_subrev_i32_e32 v10, vcc, 1, v4
|
; CHECK-NEXT: v_subrev_i32_e32 v10, vcc, 1, v4
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v0, v5
|
; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v0, v5
|
||||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
|
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
|
||||||
; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v1, v8
|
; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v1, v8
|
||||||
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8
|
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8
|
||||||
; CHECK-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v11
|
; CHECK-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v2
|
||||||
; CHECK-NEXT: v_cmp_ge_u32_e64 s[8:9], v0, v2
|
; CHECK-NEXT: v_cmp_ge_u32_e64 s[8:9], v0, v2
|
||||||
; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], vcc
|
; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], vcc
|
||||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, v6, v3, s[6:7]
|
; CHECK-NEXT: v_cndmask_b32_e64 v0, v6, v3, s[6:7]
|
||||||
|
@ -286,14 +286,14 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
|
|||||||
; CHECK-LABEL: v_urem_i32_pow2k_denom:
|
; CHECK-LABEL: v_urem_i32_pow2k_denom:
|
||||||
; CHECK: ; %bb.0:
|
; CHECK: ; %bb.0:
|
||||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; CHECK-NEXT: s_movk_i32 s6, 0x1000
|
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000
|
; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000
|
||||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6
|
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4
|
||||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
|
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
|
||||||
; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
|
; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
|
||||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
|
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6
|
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v4, v2, s6
|
; CHECK-NEXT: v_mul_hi_u32 v4, v2, s4
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
|
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
|
||||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
|
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
|
||||||
; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
|
; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
|
||||||
@ -302,9 +302,9 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
|
|||||||
; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3
|
; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3
|
||||||
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
|
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v2, v2, v0
|
; CHECK-NEXT: v_mul_hi_u32 v2, v2, v0
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v2, v2, s6
|
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
|
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
|
||||||
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
|
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
|
||||||
; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v3, v1
|
; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v3, v1
|
||||||
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
|
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
|
||||||
; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v3, v1
|
; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v3, v1
|
||||||
@ -320,9 +320,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
|
|||||||
; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
|
; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
|
||||||
; CHECK: ; %bb.0:
|
; CHECK: ; %bb.0:
|
||||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; CHECK-NEXT: s_movk_i32 s8, 0x1000
|
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000
|
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000
|
||||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s8
|
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4
|
||||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2
|
; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2
|
||||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
|
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
|
||||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
|
; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
|
||||||
@ -330,9 +330,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
|
|||||||
; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
|
; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
|
||||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
|
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
|
||||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
|
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8
|
; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v6, v3, s8
|
; CHECK-NEXT: v_mul_hi_u32 v6, v3, s4
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2
|
; CHECK-NEXT: v_lshlrev_b32_e32 v7, 12, v4
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2
|
; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
|
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
|
; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
|
||||||
@ -350,11 +350,11 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
|
|||||||
; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
|
; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0
|
; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0
|
||||||
; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1
|
; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v3, v3, s8
|
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v3
|
||||||
; CHECK-NEXT: v_mul_lo_u32 v4, v4, v2
|
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 12, v4
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3
|
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3
|
||||||
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v4
|
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v4
|
||||||
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s8, v5
|
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
|
||||||
; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v5, v2
|
; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v5, v2
|
||||||
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
|
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
|
||||||
; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v5, v2
|
; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v5, v2
|
||||||
|
Loading…
x
Reference in New Issue
Block a user