mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2024-11-25 04:02:41 +01:00)
Revert "[X86] Support AMX fast register allocation"
This reverts commit 3b8ec86fd576b9808dc63da620d9a4f7bbe04372.

Revert "[X86] Refine AMX fast register allocation"

This reverts commit c3f95e9197643b699b891ca416ce7d72cf89f5fc.

This pass breaks using LLVM in a multi-threaded environment by introducing global state.
This commit is contained in:
parent 8e0f77764c
commit 0c09b6edfc
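For context, the global state referred to above is the file-scope cache introduced by the reverted X86LowerAMXType change (the `static std::map<Value *, Value *> Col2Row;` visible in the diff below). The following is a minimal illustrative sketch, not LLVM code, of why such a file-scope static is unsafe when several threads each run the pass on their own module, and of the usual remedy of keeping the cache per pass instance; the class and function names here are hypothetical.

    #include <map>

    // Problematic pattern: one cache shared by every thread that runs the pass.
    // Two threads lowering different functions can look up and insert
    // concurrently, which is a data race on the map's internal state.
    static std::map<int, int> Col2Row;

    int cachedColToRow(int Col) {
      auto It = Col2Row.find(Col);   // may race with an insert from another thread
      if (It != Col2Row.end())
        return It->second;
      int Row = Col / 4;             // same Col-to-Row granularity the pass uses
      Col2Row[Col] = Row;            // racy insertion
      return Row;
    }

    // Safer pattern: keep the cache inside the pass object so each invocation
    // (and therefore each thread) owns an independent copy.
    class LowerAMXTypeSketch {
      std::map<int, int> Col2Row;
    public:
      int cachedColToRow(int Col) {
        auto It = Col2Row.find(Col);
        if (It == Col2Row.end())
          It = Col2Row.emplace(Col, Col / 4).first;
        return It->second;
      }
    };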
@@ -501,9 +501,6 @@ namespace llvm {
/// or split the data to two <128 x i32>.
FunctionPass *createX86LowerAMXTypePass();

/// The pass insert tile config intrinsics for AMX fast register allocation.
FunctionPass *createX86PreAMXConfigPass();

/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
FunctionPass *createX86LowerAMXIntrinsicsPass();
@@ -406,10 +406,6 @@ protected:
return false;
}

/// addPostFastRegAllocRewrite - Add passes to the optimized register
/// allocation pipeline after fast register allocation is complete.
virtual bool addPostFastRegAllocRewrite() { return false; }

/// Add passes to be run immediately after virtual registers are rewritten
/// to physical registers.
virtual void addPostRewrite() { }

@@ -1316,10 +1316,6 @@ bool TargetPassConfig::addRegAssignAndRewriteFast() {
report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");

addPass(createRegAllocPass(false));

// Allow targets to change the register assignments after
// fast register allocation.
addPostFastRegAllocRewrite();
return true;
}
@@ -34,10 +34,8 @@ set(sources
X86DiscriminateMemOps.cpp
X86LowerTileCopy.cpp
X86LowerAMXType.cpp
X86PreAMXConfig.cpp
X86LowerAMXIntrinsics.cpp
X86TileConfig.cpp
X86FastTileConfig.cpp
X86PreTileConfig.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
@@ -79,9 +79,6 @@ FunctionPass *createX86WinAllocaExpander();
/// Return a pass that config the tile registers.
FunctionPass *createX86TileConfigPass();

/// Return a pass that config the tile registers after fast reg allocation.
FunctionPass *createX86FastTileConfigPass();

/// Return a pass that insert pseudo tile config instruction.
FunctionPass *createX86PreTileConfigPass();

@@ -175,10 +172,8 @@ void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86FastTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86PreAMXConfigPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
@ -1,306 +0,0 @@
|
||||
//===-- X86FastTileConfig.cpp - Fast Tile Register Configure---------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file Pass to config the shape of AMX physical registers
|
||||
/// AMX register need to be configured before use. Before FastRegAllocation pass
|
||||
/// the ldtilecfg instruction is inserted, however at that time we don't
|
||||
/// know the shape of each physical tile registers, because the register
|
||||
/// allocation is not done yet. This pass runs after register allocation
|
||||
/// pass. It collects the shape information of each physical tile register
|
||||
/// and store the shape in the stack slot that is allocated for load config
|
||||
/// to tile config register.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "X86.h"
|
||||
#include "X86InstrBuilder.h"
|
||||
#include "X86MachineFunctionInfo.h"
|
||||
#include "X86RegisterInfo.h"
|
||||
#include "X86Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/CodeGen/TargetInstrInfo.h"
|
||||
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "fasttileconfig"
|
||||
|
||||
namespace {
|
||||
|
||||
class X86FastTileConfig : public MachineFunctionPass {
|
||||
// context
|
||||
MachineFunction *MF = nullptr;
|
||||
const X86Subtarget *ST = nullptr;
|
||||
const TargetRegisterInfo *TRI = nullptr;
|
||||
const TargetInstrInfo *TII = nullptr;
|
||||
MachineRegisterInfo *MRI = nullptr;
|
||||
|
||||
MachineInstr *getTileConfigPoint();
|
||||
void tileConfig();
|
||||
|
||||
public:
|
||||
X86FastTileConfig() : MachineFunctionPass(ID) {}
|
||||
|
||||
bool fastTileConfig();
|
||||
bool isTileLoad(MachineInstr &MI);
|
||||
bool isTileStore(MachineInstr &MI);
|
||||
bool isAMXInstr(MachineInstr &MI);
|
||||
void getTileStoreShape(MachineInstr &MI,
|
||||
SmallVector<MachineOperand *> &ShapedTiles);
|
||||
|
||||
MachineInstr *getKeyAMXInstr(MachineInstr *MI);
|
||||
void getTileShapesCfg(MachineInstr *MI,
|
||||
SmallVector<MachineOperand *> &ShapedTiles);
|
||||
void getShapeCfgInstrs(MachineInstr *MI,
|
||||
std::map<unsigned, MachineInstr *> &RowCfgs,
|
||||
std::map<unsigned, MachineInstr *> &ColCfgs);
|
||||
|
||||
/// Return the pass name.
|
||||
StringRef getPassName() const override {
|
||||
return "Fast Tile Register Configure";
|
||||
}
|
||||
|
||||
void materializeTileCfg(MachineInstr *MI);
|
||||
|
||||
void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles,
|
||||
std::map<unsigned, MachineInstr *> &RowCfgs,
|
||||
std::map<unsigned, MachineInstr *> &ColCfgs);
|
||||
|
||||
/// Perform register allocation.
|
||||
bool runOnMachineFunction(MachineFunction &MFunc) override;
|
||||
|
||||
MachineFunctionProperties getRequiredProperties() const override {
|
||||
return MachineFunctionProperties().set(
|
||||
MachineFunctionProperties::Property::NoPHIs);
|
||||
}
|
||||
|
||||
static char ID;
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
char X86FastTileConfig::ID = 0;
|
||||
|
||||
INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
|
||||
"Fast Tile Register Configure", false, false)
|
||||
INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
|
||||
"Fast Tile Register Configure", false, false)
|
||||
|
||||
static bool isTilePhysReg(MachineOperand &Op) {
|
||||
if (!Op.isReg())
|
||||
return false;
|
||||
|
||||
Register Reg = Op.getReg();
|
||||
if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static unsigned getTilePhysRegIdx(MachineOperand *Op) {
|
||||
assert(isTilePhysReg(*Op) && "Tile Operand is invalid");
|
||||
return Op->getReg() - X86::TMM0;
|
||||
}
|
||||
|
||||
static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) {
|
||||
unsigned Offset = 48 + TIdx;
|
||||
MI->getOperand(3).ChangeToImmediate(Offset);
|
||||
}
|
||||
|
||||
static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
|
||||
unsigned Offset = 16 + TIdx * 2;
|
||||
MI->getOperand(3).ChangeToImmediate(Offset);
|
||||
}
|
||||
|
||||
bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
|
||||
return MI.getOpcode() == X86::PTILELOADDV;
|
||||
}
|
||||
bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
|
||||
return MI.getOpcode() == X86::PTILESTOREDV;
|
||||
}
|
||||
bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) {
|
||||
// TODO: May need to handle some special nontile amx instruction.
|
||||
if (MI.getOpcode() == X86::LDTILECFG || MI.isDebugInstr())
|
||||
return false;
|
||||
|
||||
for (MachineOperand &MO : MI.operands())
|
||||
if (isTilePhysReg(MO))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) {
|
||||
auto Cfg = MachineBasicBlock::iterator(MI);
|
||||
MachineBasicBlock *MBB = MI->getParent();
|
||||
MachineInstr *KeyMI = nullptr;
|
||||
int KeyAMXNum = 0;
|
||||
|
||||
for (auto II = Cfg; II != MBB->end(); II++) {
|
||||
if (isTileLoad(*II)) {
|
||||
KeyMI = &*II;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isTileStore(*II)) {
|
||||
assert(KeyMI && "Key AMX Should be found before!");
|
||||
break;
|
||||
}
|
||||
|
||||
if (isAMXInstr(*II)) {
|
||||
assert((KeyAMXNum == 0) && "Too many Key AMX instruction!");
|
||||
KeyAMXNum++;
|
||||
KeyMI = &*II;
|
||||
}
|
||||
}
|
||||
assert(KeyMI && "There must be an AMX instruction.");
|
||||
return KeyMI;
|
||||
}
|
||||
|
||||
// Orderly get the tiles in key amx instruction, uses before defs.
|
||||
void X86FastTileConfig::getTileShapesCfg(
|
||||
MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) {
|
||||
MachineInstr *KeyMI = getKeyAMXInstr(CfgMI);
|
||||
|
||||
SmallVector<MachineOperand *> DefTiles;
|
||||
for (MachineOperand &MO : KeyMI->operands()) {
|
||||
if (!isTilePhysReg(MO))
|
||||
continue;
|
||||
if (MO.isDef())
|
||||
DefTiles.push_back(&MO);
|
||||
else
|
||||
ShapedTiles.push_back(&MO);
|
||||
}
|
||||
ShapedTiles.append(DefTiles);
|
||||
}
|
||||
|
||||
// We pre-config the shapes at position named with "amx.tmm.N.shape.row* and
|
||||
// amx.shape.N.col*" at pass "Pre AMX Tile Config".
|
||||
// The 'N' implies the order of tiles in key amx intrinsic.
|
||||
void X86FastTileConfig::getShapeCfgInstrs(
|
||||
MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs,
|
||||
std::map<unsigned, MachineInstr *> &ColCfgs) {
|
||||
auto Cfg = MachineBasicBlock::iterator(MI);
|
||||
MachineBasicBlock *MBB = MI->getParent();
|
||||
|
||||
for (auto II = Cfg; II != MBB->begin(); II--) {
|
||||
if (isAMXInstr(*II) || II->isTerminator() || II->isCall())
|
||||
break;
|
||||
if (!II->mayStore() || !II->hasOneMemOperand())
|
||||
continue;
|
||||
const Value *MemPtr = II->memoperands()[0]->getValue();
|
||||
if (!MemPtr)
|
||||
continue;
|
||||
|
||||
StringRef Name = MemPtr->getName();
|
||||
if (!Name.startswith("amx.tmm."))
|
||||
continue;
|
||||
|
||||
// Get the 'N'th tile shape config in key amx instruction.
|
||||
auto N = Name.find(".shape");
|
||||
StringRef STileIdx = Name.slice(8, N);
|
||||
unsigned Idx;
|
||||
STileIdx.getAsInteger(10, Idx);
|
||||
|
||||
// And relate them to their store instructions.
|
||||
if (Name.contains("row"))
|
||||
RowCfgs[Idx] = &*II;
|
||||
else if (Name.contains("col"))
|
||||
ColCfgs[Idx] = &*II;
|
||||
else
|
||||
llvm_unreachable("Invalid tile shape info!");
|
||||
}
|
||||
assert((RowCfgs.size() == ColCfgs.size()) &&
|
||||
"The number of tile row and col must be equal!");
|
||||
}
|
||||
|
||||
// Here is the data format for the tile config.
|
||||
// 0 palette = 1 now.
|
||||
// 1 start_row = 0 now.
|
||||
// 2-15 reserved, must be zero
|
||||
// 16-17 tile0.colsb Tile 0 bytes per row.
|
||||
// 18-19 tile1.colsb Tile 1 bytes per row.
|
||||
// 20-21 tile2.colsb Tile 2 bytes per row.
|
||||
// ... (sequence continues)
|
||||
// 30-31 tile7.colsb Tile 7 bytes per row.
|
||||
// 32-47 reserved, must be zero
|
||||
// 48 tile0.rows Tile 0 rows.
|
||||
// 49 tile1.rows Tile 1 rows.
|
||||
// 50 tile2.rows Tile 2 rows.
|
||||
// ... (sequence continues)
|
||||
// 55 tile7.rows Tile 7 rows.
|
||||
// 56-63 reserved, must be zero
|
||||
void X86FastTileConfig::rewriteTileCfg(
|
||||
SmallVector<MachineOperand *> &ShapedTiles,
|
||||
std::map<unsigned, MachineInstr *> &RowCfgs,
|
||||
std::map<unsigned, MachineInstr *> &ColCfgs) {
|
||||
assert((RowCfgs.size() == ShapedTiles.size()) &&
|
||||
"The number of tile shapes not equal with the number of tiles!");
|
||||
|
||||
// Orderly get the tiles and adjust the shape config.
|
||||
for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) {
|
||||
MachineOperand *MO = ShapedTiles[I];
|
||||
unsigned TmmIdx = getTilePhysRegIdx(MO);
|
||||
if (I == TmmIdx)
|
||||
continue;
|
||||
adjustRowCfg(TmmIdx, RowCfgs[I]);
|
||||
adjustColCfg(TmmIdx, ColCfgs[I]);
|
||||
}
|
||||
}
|
||||
|
||||
// We have already preconfigured the shapes before fast register allocation at
|
||||
// X86PreAMXConfig::preWriteTileCfg(). Now, we have done fast register
|
||||
// allocation, the shapes pre-written before may not correctly correspond
|
||||
// to the correct tmm registers, so we need to adjust them.
|
||||
void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) {
|
||||
SmallVector<MachineOperand *> ShapedTiles;
|
||||
std::map<unsigned, MachineInstr *> RowCfgs;
|
||||
std::map<unsigned, MachineInstr *> ColCfgs;
|
||||
|
||||
// Orderly keep the tile uses and def in ShapedTiles;
|
||||
getTileShapesCfg(CfgMI, ShapedTiles);
|
||||
assert(ShapedTiles.size() && "Not find shapes config!");
|
||||
|
||||
getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs);
|
||||
|
||||
rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs);
|
||||
}
|
||||
|
||||
bool X86FastTileConfig::fastTileConfig() {
|
||||
bool Changed = false;
|
||||
|
||||
for (MachineBasicBlock &MBB : *MF) {
|
||||
SmallVector<MachineInstr *, 2> CFGs;
|
||||
for (MachineInstr &MI : MBB)
|
||||
if (MI.getOpcode() == X86::LDTILECFG)
|
||||
CFGs.push_back(&MI);
|
||||
for (auto *MI : CFGs)
|
||||
materializeTileCfg(MI);
|
||||
if (!CFGs.empty())
|
||||
Changed = true;
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
|
||||
MF = &MFunc;
|
||||
MRI = &MFunc.getRegInfo();
|
||||
ST = &MFunc.getSubtarget<X86Subtarget>();
|
||||
TRI = ST->getRegisterInfo();
|
||||
TII = MFunc.getSubtarget().getInstrInfo();
|
||||
|
||||
return fastTileConfig();
|
||||
}
|
||||
|
||||
FunctionPass *llvm::createX86FastTileConfigPass() {
|
||||
return new X86FastTileConfig();
|
||||
}
|
@ -34,7 +34,6 @@
|
||||
#include "llvm/IR/PatternMatch.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/LoopUtils.h"
|
||||
@ -53,10 +52,6 @@ static bool isV256I32Ty(Type *Ty) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static cl::opt<bool>
|
||||
X86ScalarizeAMX("enable-x86-scalar-amx", cl::init(false), cl::Hidden,
|
||||
cl::desc("X86: enable AMX scalarizition."));
|
||||
|
||||
namespace {
|
||||
class X86LowerAMXIntrinsics {
|
||||
Function &Func;
|
||||
@ -98,7 +93,6 @@ private:
|
||||
lowerTileDP(Instruction *TileDP);
|
||||
bool lowerTileZero(Instruction *TileZero);
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader,
|
||||
BasicBlock *Exit, Value *Bound,
|
||||
@ -630,6 +624,9 @@ bool X86LowerAMXIntrinsics::visit() {
|
||||
|
||||
return C;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
namespace {
|
||||
|
||||
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
|
||||
public:
|
||||
@ -641,8 +638,6 @@ public:
|
||||
}
|
||||
|
||||
bool runOnFunction(Function &F) override {
|
||||
if (!X86ScalarizeAMX)
|
||||
return false;
|
||||
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
|
||||
if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
|
||||
TM->getOptLevel() != CodeGenOpt::None)
|
||||
@ -666,6 +661,8 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
static const char PassName[] = "Lower AMX intrinsics";
|
||||
char X86LowerAMXIntrinsicsLegacyPass::ID = 0;
|
||||
INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
|
||||
|
@ -1,4 +1,4 @@
|
||||
//===- Target/X86/X86LowerAMXType.cpp - -------------------------*- C++ -*-===//
|
||||
//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
@ -14,27 +14,6 @@
|
||||
/// load/store <256 x i32> instruction to AMX load/store. If the bitcast can
|
||||
/// not be combined with load/store, we transform the bitcast to amx load/store
|
||||
/// and <256 x i32> store/load.
|
||||
///
|
||||
/// If Front End not use O0 but the Mid/Back end use O0, (e.g. "Clang -O2 -S
|
||||
/// -emit-llvm t.c" + "llc t.ll") we should make sure the amx data is volatile,
|
||||
/// because that is necessary for AMX fast register allocation. (In Fast
|
||||
/// register allocation, register will be allocated before spill/reload, so
|
||||
/// there is no additional register for amx to identify the step in spill.)
|
||||
/// The volatileTileData() will handle this case.
|
||||
/// e.g.
|
||||
/// ----------------------------------------------------------
|
||||
/// | def %td = ... |
|
||||
/// | ... |
|
||||
/// | "use %td" |
|
||||
/// ----------------------------------------------------------
|
||||
/// will transfer to -->
|
||||
/// ----------------------------------------------------------
|
||||
/// | def %td = ... |
|
||||
/// | call void @llvm.x86.tilestored64.internal(mem, %td) |
|
||||
/// | ... |
|
||||
/// | %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)|
|
||||
/// | "use %td2" |
|
||||
/// ----------------------------------------------------------
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
@ -62,13 +41,7 @@ using namespace PatternMatch;
|
||||
|
||||
#define DEBUG_TYPE "lower-amx-type"
|
||||
|
||||
// In AMX intrinsics we let Shape = {Row, Col}, but the
|
||||
// RealCol = Col / ElementSize. We may use the RealCol
|
||||
// as a new Row for other new created AMX intrinsics.
|
||||
static std::map<Value *, Value *> Col2Row;
|
||||
|
||||
static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
|
||||
BasicBlock *BB) {
|
||||
static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
|
||||
Function &F = *BB->getParent();
|
||||
Module *M = BB->getModule();
|
||||
const DataLayout &DL = M->getDataLayout();
|
||||
@ -83,36 +56,7 @@ static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
|
||||
return AllocaRes;
|
||||
}
|
||||
|
||||
static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) {
|
||||
if (Col2Row.count(V))
|
||||
return Col2Row[V];
|
||||
IRBuilder<> Builder(&*II->getParent()->getFirstInsertionPt());
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
BasicBlock::iterator Iter = I->getIterator();
|
||||
++Iter;
|
||||
Builder.SetInsertPoint(&*Iter);
|
||||
}
|
||||
ConstantInt *Gran = Builder.getInt16(Granularity);
|
||||
Value *RealRow = Builder.CreateUDiv(V, Gran);
|
||||
Col2Row[V] = RealRow;
|
||||
return RealRow;
|
||||
}
|
||||
|
||||
namespace {
|
||||
class X86LowerAMXType {
|
||||
Function &Func;
|
||||
TargetMachine *TM = nullptr;
|
||||
|
||||
public:
|
||||
X86LowerAMXType(Function &F, TargetMachine *TargetM) : Func(F), TM(TargetM) {}
|
||||
bool visit();
|
||||
void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
|
||||
void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
|
||||
bool transformBitcast(BitCastInst *Bitcast);
|
||||
std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
|
||||
};
|
||||
|
||||
std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II, unsigned OpNo) {
|
||||
static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
|
||||
Value *Row = nullptr, *Col = nullptr;
|
||||
switch (II->getIntrinsicID()) {
|
||||
default:
|
||||
@ -141,13 +85,6 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II, unsigne
|
||||
break;
|
||||
case 5:
|
||||
Row = II->getArgOperand(2);
|
||||
// FIXME: There is a design bug for AMX shape, which the Col should be
|
||||
// Col/4 if it will be used as Row, but current Greedy RA can't handle
|
||||
// this case well, it may failed if we generate a new Shape definition.
|
||||
// So Let's just do it in O0 first.
|
||||
// Row = Row / 4
|
||||
if (TM->getOptLevel() == CodeGenOpt::None)
|
||||
Row = getRowFromCol(II, Row, 4);
|
||||
Col = II->getArgOperand(1);
|
||||
break;
|
||||
}
|
||||
@ -163,7 +100,7 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II, unsigne
|
||||
// -->
|
||||
// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
|
||||
// i8* %addr, i64 %stride64)
|
||||
void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
|
||||
static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
|
||||
Value *Row = nullptr, *Col = nullptr;
|
||||
Use &U = *(Bitcast->use_begin());
|
||||
unsigned OpNo = U.getOperandNo();
|
||||
@ -188,7 +125,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
|
||||
// -->
|
||||
// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
|
||||
// %stride64, %13)
|
||||
void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
|
||||
static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
|
||||
|
||||
Value *Tile = Bitcast->getOperand(0);
|
||||
auto *II = cast<IntrinsicInst>(Tile);
|
||||
@ -220,14 +157,14 @@ void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
|
||||
}
|
||||
|
||||
// transform bitcast to <store, load> instructions.
|
||||
bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
|
||||
static bool transformBitcast(BitCastInst *Bitcast) {
|
||||
IRBuilder<> Builder(Bitcast);
|
||||
AllocaInst *AllocaAddr;
|
||||
Value *I8Ptr, *Stride;
|
||||
auto *Src = Bitcast->getOperand(0);
|
||||
|
||||
auto Prepare = [&]() {
|
||||
AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent());
|
||||
AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
|
||||
I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
|
||||
Stride = Builder.getInt64(64);
|
||||
};
|
||||
@ -278,9 +215,17 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
class X86LowerAMXType {
|
||||
Function &Func;
|
||||
|
||||
public:
|
||||
X86LowerAMXType(Function &F) : Func(F) {}
|
||||
bool visit();
|
||||
};
|
||||
|
||||
bool X86LowerAMXType::visit() {
|
||||
SmallVector<Instruction *, 8> DeadInsts;
|
||||
Col2Row.clear();
|
||||
|
||||
for (BasicBlock *BB : post_order(&Func)) {
|
||||
for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
|
||||
@ -377,260 +322,6 @@ bool X86LowerAMXType::visit() {
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
static Value *getAllocaPos(BasicBlock *BB) {
|
||||
Module *M = BB->getModule();
|
||||
Function *F = BB->getParent();
|
||||
IRBuilder<> Builder(&F->getEntryBlock().front());
|
||||
const DataLayout &DL = M->getDataLayout();
|
||||
unsigned AllocaAS = DL.getAllocaAddrSpace();
|
||||
Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
|
||||
AllocaInst *AllocaRes =
|
||||
new AllocaInst(V256I32Ty, AllocaAS, "", &F->getEntryBlock().front());
|
||||
BasicBlock::iterator Iter = AllocaRes->getIterator();
|
||||
++Iter;
|
||||
Builder.SetInsertPoint(&*Iter);
|
||||
Value *I8Ptr = Builder.CreateBitCast(AllocaRes, Builder.getInt8PtrTy());
|
||||
return I8Ptr;
|
||||
}
|
||||
|
||||
static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
|
||||
assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
|
||||
auto *II = cast<IntrinsicInst>(TileDef);
|
||||
assert(II && "Not tile intrinsic!");
|
||||
Value *Row = II->getOperand(0);
|
||||
Value *Col = II->getOperand(1);
|
||||
|
||||
BasicBlock *BB = TileDef->getParent();
|
||||
BasicBlock::iterator Iter = TileDef->getIterator();
|
||||
IRBuilder<> Builder(BB, ++Iter);
|
||||
Value *Stride = Builder.getInt64(64);
|
||||
std::array<Value *, 5> Args = {Row, Col, Ptr, Stride, TileDef};
|
||||
|
||||
Instruction *TileStore =
|
||||
Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
|
||||
return TileStore;
|
||||
}
|
||||
|
||||
static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
|
||||
Value *V = U.get();
|
||||
assert(V->getType()->isX86_AMXTy() && "Not define tile!");
|
||||
|
||||
// Get tile shape.
|
||||
IntrinsicInst *II = nullptr;
|
||||
if (IsPHI) {
|
||||
Value *PhiOp = dyn_cast<PHINode>(V)->getIncomingValue(0);
|
||||
II = cast<IntrinsicInst>(PhiOp);
|
||||
} else {
|
||||
II = cast<IntrinsicInst>(V);
|
||||
}
|
||||
Value *Row = II->getOperand(0);
|
||||
Value *Col = II->getOperand(1);
|
||||
|
||||
Instruction *UserI = dyn_cast<Instruction>(U.getUser());
|
||||
IRBuilder<> Builder(UserI);
|
||||
Value *Stride = Builder.getInt64(64);
|
||||
std::array<Value *, 4> Args = {Row, Col, Ptr, Stride};
|
||||
|
||||
Value *TileLoad =
|
||||
Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
|
||||
UserI->replaceUsesOfWith(V, TileLoad);
|
||||
}
|
||||
|
||||
static bool isIncomingOfPHI(Instruction *I) {
|
||||
for (Use &U : I->uses()) {
|
||||
User *V = U.getUser();
|
||||
if (isa<PHINode>(V))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Let all AMX tile data become volatile data, shorten the life range
|
||||
// of each tile register before fast register allocation.
|
||||
namespace {
|
||||
class X86VolatileTileData {
|
||||
Function &F;
|
||||
|
||||
public:
|
||||
X86VolatileTileData(Function &Func) : F(Func) {}
|
||||
Value *updatePhiIncomings(BasicBlock *BB,
|
||||
SmallVector<Instruction *, 2> &Imcomings);
|
||||
void replacePhiDefWithLoad(Instruction *PHI, Value *StorePtr);
|
||||
bool volatileTileData();
|
||||
void volatileTilePHI(PHINode *Inst);
|
||||
void volatileTileNonPHI(Instruction *I);
|
||||
};
|
||||
|
||||
Value *X86VolatileTileData::updatePhiIncomings(
|
||||
BasicBlock *BB, SmallVector<Instruction *, 2> &Imcomings) {
|
||||
Value *I8Ptr = getAllocaPos(BB);
|
||||
|
||||
for (auto *I : Imcomings) {
|
||||
User *Store = createTileStore(I, I8Ptr);
|
||||
|
||||
// All its uses (except phi) should load from stored mem.
|
||||
for (Use &U : I->uses()) {
|
||||
User *V = U.getUser();
|
||||
if (isa<PHINode>(V) || V == Store)
|
||||
continue;
|
||||
replaceWithTileLoad(U, I8Ptr);
|
||||
}
|
||||
}
|
||||
return I8Ptr;
|
||||
}
|
||||
|
||||
void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI,
|
||||
Value *StorePtr) {
|
||||
for (Use &U : PHI->uses())
|
||||
replaceWithTileLoad(U, StorePtr, true);
|
||||
PHI->eraseFromParent();
|
||||
}
|
||||
|
||||
// Similar to volatileTileNonPHI, this function only handles PHI Nodes
|
||||
// and their related AMX intrinsics.
|
||||
// 1) PHI Def should change to tileload.
|
||||
// 2) PHI Incoming Values should tilestored in just after their def.
|
||||
// 3) The mem of these tileload and tilestores should be same.
|
||||
// e.g.
|
||||
// ------------------------------------------------------
|
||||
// bb_dom:
|
||||
// ...
|
||||
// br i1 %bool.cond, label %if.else, label %if.then
|
||||
//
|
||||
// if.then:
|
||||
// def %t0 = ...
|
||||
// ...
|
||||
// use %t0
|
||||
// ...
|
||||
// br label %if.end
|
||||
//
|
||||
// if.else:
|
||||
// def %t1 = ...
|
||||
// br label %if.end
|
||||
//
|
||||
// if.end:
|
||||
// %td = phi x86_amx [ %t1, %if.else ], [ %t0, %if.then ]
|
||||
// ...
|
||||
// use %td
|
||||
// ------------------------------------------------------
|
||||
// -->
|
||||
// ------------------------------------------------------
|
||||
// bb_entry:
|
||||
// %mem = alloca <256 x i32>, align 1024 *
|
||||
// ...
|
||||
// bb_dom:
|
||||
// ...
|
||||
// br i1 %bool.cond, label %if.else, label %if.then
|
||||
//
|
||||
// if.then:
|
||||
// def %t0 = ...
|
||||
// call void @llvm.x86.tilestored64.internal(mem, %t0) *
|
||||
// ...
|
||||
// %t0` = call x86_amx @llvm.x86.tileloadd64.internal(mem)*
|
||||
// use %t0` *
|
||||
// ...
|
||||
// br label %if.end
|
||||
//
|
||||
// if.else:
|
||||
// def %t1 = ...
|
||||
// call void @llvm.x86.tilestored64.internal(mem, %t1) *
|
||||
// br label %if.end
|
||||
//
|
||||
// if.end:
|
||||
// ...
|
||||
// %td = call x86_amx @llvm.x86.tileloadd64.internal(mem) *
|
||||
// use %td
|
||||
// ------------------------------------------------------
|
||||
void X86VolatileTileData::volatileTilePHI(PHINode *PHI) {
|
||||
BasicBlock *BB = PHI->getParent();
|
||||
SmallVector<Instruction *, 2> Imcomings;
|
||||
|
||||
for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
|
||||
Value *Op = PHI->getIncomingValue(I);
|
||||
Instruction *Inst = dyn_cast<Instruction>(Op);
|
||||
assert(Inst && "We shouldn't fold AMX instrution!");
|
||||
Imcomings.push_back(Inst);
|
||||
}
|
||||
|
||||
Value *StorePtr = updatePhiIncomings(BB, Imcomings);
|
||||
replacePhiDefWithLoad(PHI, StorePtr);
|
||||
}
|
||||
|
||||
// Store the defined tile and load it before use.
|
||||
// All its users are not PHI.
|
||||
// e.g.
|
||||
// ------------------------------------------------------
|
||||
// def %td = ...
|
||||
// ...
|
||||
// "use %td"
|
||||
// ------------------------------------------------------
|
||||
// -->
|
||||
// ------------------------------------------------------
|
||||
// def %td = ...
|
||||
// call void @llvm.x86.tilestored64.internal(mem, %td)
|
||||
// ...
|
||||
// %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)
|
||||
// "use %td2"
|
||||
// ------------------------------------------------------
|
||||
void X86VolatileTileData::volatileTileNonPHI(Instruction *I) {
|
||||
BasicBlock *BB = I->getParent();
|
||||
Value *I8Ptr = getAllocaPos(BB);
|
||||
User *Store = createTileStore(I, I8Ptr);
|
||||
|
||||
// All its uses should load from stored mem.
|
||||
for (Use &U : I->uses()) {
|
||||
User *V = U.getUser();
|
||||
assert(!isa<PHINode>(V) && "PHI Nodes should be excluded!");
|
||||
if (V != Store)
|
||||
replaceWithTileLoad(U, I8Ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// Volatile Tile Model:
|
||||
// 1) All the uses of tile data comes from tileload in time.
|
||||
// 2) All the defs of tile data tilestore into mem immediately.
|
||||
// For example:
|
||||
// --------------------------------------------------------------------------
|
||||
// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
|
||||
// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
|
||||
// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
|
||||
// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
|
||||
// call void @llvm.x86.tilestored64.internal(... td) area
|
||||
// --------------------------------------------------------------------------
|
||||
// 3) No terminator, call or other amx instructions in the key amx area.
|
||||
bool X86VolatileTileData::volatileTileData() {
|
||||
bool Changed = false;
|
||||
for (BasicBlock &BB : F) {
|
||||
SmallVector<Instruction *, 2> PHIInsts;
|
||||
SmallVector<Instruction *, 8> AMXDefInsts;
|
||||
|
||||
for (Instruction &I : BB) {
|
||||
if (!I.getType()->isX86_AMXTy())
|
||||
continue;
|
||||
if (isa<PHINode>(&I))
|
||||
PHIInsts.push_back(&I);
|
||||
else
|
||||
AMXDefInsts.push_back(&I);
|
||||
}
|
||||
|
||||
// First we "volatile" the non-phi related amx intrinsics.
|
||||
for (Instruction *I : AMXDefInsts) {
|
||||
if (isIncomingOfPHI(I))
|
||||
continue;
|
||||
volatileTileNonPHI(I);
|
||||
Changed = true;
|
||||
}
|
||||
|
||||
for (Instruction *I : PHIInsts) {
|
||||
volatileTilePHI(dyn_cast<PHINode>(I));
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
namespace {
|
||||
|
||||
class X86LowerAMXTypeLegacyPass : public FunctionPass {
|
||||
@ -643,24 +334,11 @@ public:
|
||||
|
||||
bool runOnFunction(Function &F) override {
|
||||
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
|
||||
|
||||
X86LowerAMXType LAT(F, TM);
|
||||
if (F.hasFnAttribute(Attribute::OptimizeNone) ||
|
||||
TM->getOptLevel() == CodeGenOpt::None)
|
||||
return false;
|
||||
X86LowerAMXType LAT(F);
|
||||
bool C = LAT.visit();
|
||||
|
||||
// Prepare for fast register allocation at O0.
|
||||
// Todo: May better check the volatile model of AMX code, not just
|
||||
// by checking Attribute::OptimizeNone and CodeGenOpt::None.
|
||||
if (TM->getOptLevel() == CodeGenOpt::None) {
|
||||
// If Front End not use O0 but the Mid/Back end use O0, (e.g.
|
||||
// "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make
|
||||
// sure the amx data is volatile, which is necessary for AMX fast
|
||||
// register allocation.
|
||||
if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
|
||||
X86VolatileTileData VTD(F);
|
||||
C = VTD.volatileTileData() || C;
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
|
@ -1,422 +0,0 @@
|
||||
//===- Target/X86/X86PreAMXConfig.cpp - ------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// Insert tilecfg for each area of key AMX intrinsic.
|
||||
/// All the key AMX intrinsic's tile operand must come from tileload. And the
|
||||
/// def tile of key AMX intrinsic must be tilestored.
|
||||
/// take tdpbssd for example:
|
||||
/// --------------------------------------------------------------------------
|
||||
/// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(...) key
|
||||
/// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(...) |
|
||||
/// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(...) amx
|
||||
/// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(t1, t2, t3) |
|
||||
/// call void @llvm.x86.tilestored64.internal(... td) area
|
||||
/// --------------------------------------------------------------------------
|
||||
/// This pass will insert tilecfg before every key-amx-area, some like:
|
||||
/// --------------------------------------------------------------------------
|
||||
/// %cfgmem = alloca <16 x i32>, align 4 * allocate mem
|
||||
/// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init
|
||||
/// ...
|
||||
/// ... pre-config shape of %t1 *
|
||||
/// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
|
||||
/// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
|
||||
/// ... *
|
||||
/// ... pre-config shape of %t2 * shapes
|
||||
/// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 *
|
||||
/// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
|
||||
/// ...
|
||||
/// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * tile config
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
#include "X86.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||
#include "llvm/CodeGen/ValueTypes.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/IntrinsicInst.h"
|
||||
#include "llvm/IR/IntrinsicsX86.h"
|
||||
#include "llvm/IR/PatternMatch.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace PatternMatch;
|
||||
|
||||
#define DEBUG_TYPE "pre-amx-config"
|
||||
|
||||
static bool isAMXIntrinsic(IntrinsicInst *II) {
|
||||
for (Value *Operand : II->operands())
|
||||
if (Operand->getType()->isX86_AMXTy())
|
||||
return true;
|
||||
return II->getType()->isX86_AMXTy();
|
||||
}
|
||||
|
||||
static bool isTileLoad(IntrinsicInst *II) {
|
||||
return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
|
||||
}
|
||||
|
||||
static bool isTileStore(IntrinsicInst *II) {
|
||||
return II->getIntrinsicID() == Intrinsic::x86_tilestored64_internal;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
static bool onlyTileDef(IntrinsicInst *II) {
|
||||
for (Value *Operand : II->operands())
|
||||
if (Operand->getType()->isX86_AMXTy())
|
||||
return false;
|
||||
return II->getType()->isX86_AMXTy();
|
||||
}
|
||||
|
||||
static bool brokenVolatile(Instruction *I) {
|
||||
// Todo: it is weak to identify a normal call here.
|
||||
if ((isa<CallInst>(I) && !isa<IntrinsicInst>(I)) || I->isTerminator())
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
class X86PreAMXConfig {
|
||||
Function &F;
|
||||
|
||||
public:
|
||||
X86PreAMXConfig(Function &Func) : F(Func) {}
|
||||
bool preTileConfig();
|
||||
bool addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
|
||||
bool findConfigShapes(
|
||||
DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes);
|
||||
bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes);
|
||||
bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
|
||||
SmallVector<Value *, 8> &Shapes);
|
||||
BasicBlock::iterator
|
||||
getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
|
||||
SmallVector<Value *, 8> &Shapes);
|
||||
bool checkVolatileModel(SmallSet<Value *, 4> &Loads, IntrinsicInst *Store,
|
||||
IntrinsicInst *KeyAMX);
|
||||
};
|
||||
|
||||
// Orderly write the shapes into tilecfg's mem. This may not be right.
|
||||
// Because the first shape may not correspond to the first tmm register,
|
||||
// so we need to handle it at X86FastTileConfig::materializeTileCfg()
|
||||
// after register allocation.
|
||||
// For example:
|
||||
// --------------------------------------------------------------------------
|
||||
// zeroinitialize tilecfg's mem (of ldtilecfg)
|
||||
// --------------------------------------------------------------------------
|
||||
// ... pre-config shape of %t1 *
|
||||
// %amx.tmm.0.shape.row = getelementptr i8, i8* %mem, i64 48 *
|
||||
// %amx.tmm.0.shape.col = getelementptr i16, i16* %mem, i64 16 *
|
||||
// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
|
||||
// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
|
||||
// ... *
|
||||
// ... pre-config shape of %t2 *
|
||||
// %amx.tmm.1.shape.row = getelementptr i8, i8* %mem, i64 49 *
|
||||
// %amx.tmm.1.shape.col = getelementptr i16, i16* %mem, i64 18 *
|
||||
// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes
|
||||
// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
|
||||
// ... *
|
||||
// ... pre-config shape of %t3 * of
|
||||
// %amx.tmm.2.shape.row = getelementptr i8, i8* %mem, i64 50 *
|
||||
// %amx.tmm.2.shape.col = getelementptr i16, i16* %mem, i64 20 *
|
||||
// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 *
|
||||
// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 *
|
||||
// ... * tiles
|
||||
// ... pre-config shape of %td *
|
||||
// %amx.tmm.3.shape.row = getelementptr i8, i8* %mem, i64 51 *
|
||||
// %amx.tmm.3.shape.col = getelementptr i16, i16* %mem, i64 22 *
|
||||
// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 *
|
||||
// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 *
|
||||
// --------------------------------------------------------------------------
|
||||
// call void @llvm.x86.ldtilecfg(i8* %mem) * tile config
|
||||
// --------------------------------------------------------------------------
|
||||
// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
|
||||
// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
|
||||
// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
|
||||
// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
|
||||
// call void @llvm.x86.tilestored64.internal(... td) area
|
||||
// --------------------------------------------------------------------------
|
||||
bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
|
||||
SmallVector<Value *, 8> &Shapes) {
|
||||
bool Write = false;
|
||||
LLVMContext &Ctx = Pos->getParent()->getContext();
|
||||
Type *I8Ty = Type::getInt8Ty(Ctx);
|
||||
Type *I16Ty = Type::getInt16Ty(Ctx);
|
||||
|
||||
// TODO: Currently we set Palette = 1 by default; it may be assigned to
|
||||
// other value in the future.
|
||||
Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
|
||||
Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
|
||||
Value *PalettePos =
|
||||
GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos);
|
||||
new StoreInst(PaletteValue, PalettePos, Pos);
|
||||
|
||||
for (int I = 0, E = Shapes.size() / 2; I < E; I++) {
|
||||
Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I);
|
||||
Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2);
|
||||
const std::string ShapeName = "amx.tmm." + itostr(I);
|
||||
Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset,
|
||||
ShapeName + ".shape.row", Pos);
|
||||
Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos);
|
||||
ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0),
|
||||
ShapeName + ".shape.col", Pos);
|
||||
Value *Row = Shapes[I * 2];
|
||||
Value *Col = Shapes[I * 2 + 1];
|
||||
Row = new TruncInst(Row, I8Ty, "", Pos);
|
||||
new StoreInst(Row, RowPos, Pos);
|
||||
new StoreInst(Col, ColPos, Pos);
|
||||
Write = true;
|
||||
}
|
||||
return Write;
|
||||
}
|
||||
|
||||
bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
|
||||
SmallVector<Value *, 8> &Shapes) {
|
||||
Module *M = F.getParent();
|
||||
IRBuilder<> Builder(ModelStart);
|
||||
const DataLayout &DL = M->getDataLayout();
|
||||
unsigned AddrSpace = DL.getAllocaAddrSpace();
|
||||
LLVMContext &Ctx = Builder.getContext();
|
||||
Type *V512Ty = VectorType::get(Builder.getInt32Ty(), 16, false);
|
||||
Align Alignment = DL.getPrefTypeAlign(Type::getInt32Ty(Ctx));
|
||||
|
||||
AllocaInst *Addr =
|
||||
new AllocaInst(V512Ty, AddrSpace, "", &F.getEntryBlock().front());
|
||||
Addr->setAlignment(Alignment);
|
||||
Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
|
||||
|
||||
std::array<Value *, 1> Args = {I8Ptr};
|
||||
Instruction *Cfg =
|
||||
Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg, None, Args);
|
||||
|
||||
Value *Val0 = Constant::getNullValue(V512Ty);
|
||||
Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg);
|
||||
assert(Init0 && "Not Zero initilizate the cfg mem!");
|
||||
|
||||
preWriteTileCfg(I8Ptr, Cfg, Shapes);
|
||||
|
||||
return Init0;
|
||||
}
|
||||
|
||||
// Todo: We may need to handle "more than one store" case in the future.
|
||||
bool X86PreAMXConfig::checkVolatileModel(SmallSet<Value *, 4> &Loads,
|
||||
IntrinsicInst *Store,
|
||||
IntrinsicInst *KeyAMX) {
|
||||
Value *ST = Store->getOperand(4);
|
||||
|
||||
// Only has tileload and tilestore.
|
||||
if (!KeyAMX)
|
||||
return (Loads.size() == 1) && Loads.contains(ST);
|
||||
|
||||
// All Loads should be operands of KeyAMX.
|
||||
// All tile operands of KeyAMX should come from Loads.
|
||||
for (Value *Op : KeyAMX->operands()) {
|
||||
if (Op->getType()->isX86_AMXTy())
|
||||
if (!Loads.erase(Op))
|
||||
return false;
|
||||
}
|
||||
|
||||
// The def of KeyAMX should be stored into mem.
|
||||
// Todo: can a key amx have no def?
|
||||
return Loads.empty() && (ST == cast<Value>(KeyAMX));
|
||||
}
|
||||
|
||||
bool X86PreAMXConfig::getKeyAMXShapes(IntrinsicInst *KeyAMX,
|
||||
SmallVector<Value *, 8> &Shapes) {
|
||||
for (unsigned I = 0; I < KeyAMX->getNumOperands(); I++) {
|
||||
Value *Op = KeyAMX->getOperand(I);
|
||||
if (!Op->getType()->isX86_AMXTy())
|
||||
continue;
|
||||
IntrinsicInst *TileDef = dyn_cast<IntrinsicInst>(Op);
|
||||
assert((TileDef && isTileLoad(TileDef)) &&
|
||||
"All KeyAMX's tile definiation should comes from TileLoad!");
|
||||
Shapes.push_back(TileDef->getOperand(0));
|
||||
Shapes.push_back(TileDef->getOperand(1));
|
||||
}
|
||||
if (!isTileStore(KeyAMX)) {
|
||||
Shapes.push_back(KeyAMX->getOperand(0));
|
||||
Shapes.push_back(KeyAMX->getOperand(1));
|
||||
}
|
||||
return Shapes.size() != 0;
|
||||
}
|
||||
|
||||
// Collect the shapes and skip the area of current key amx intrinsic.
|
||||
//
|
||||
// For example:
|
||||
// ...
|
||||
// --------------------------------------------------------------------------
|
||||
// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) record (m,k)
|
||||
// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) record (m,k)
|
||||
// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) record (m,k)
|
||||
// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3)
|
||||
// call void @llvm.x86.tilestored64.internal(m, n,... td) <--PosEnd record (m,k)
|
||||
// --------------------------------------------------------------------------
|
||||
BasicBlock::iterator
|
||||
X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
|
||||
SmallVector<Value *, 8> &Shapes) {
|
||||
IntrinsicInst *KeyAMX = nullptr;
|
||||
BasicBlock *BB = Iter->getParent();
|
||||
BasicBlock::iterator PosEnd = BB->end();
|
||||
SmallSet<Value *, 4> Loads;
|
||||
|
||||
// See TileStore as "Config Position End" and check volatile model.
|
||||
for (auto I = Iter, E = BB->end(); I != E; ++I) {
|
||||
assert(!brokenVolatile(&*I) && "Not reach tile store!");
|
||||
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
|
||||
if (!II || !isAMXIntrinsic(II))
|
||||
continue;
|
||||
|
||||
if (isTileLoad(II)) {
|
||||
Loads.insert(II);
|
||||
} else if (isTileStore(II)) {
|
||||
if (!checkVolatileModel(Loads, II, KeyAMX))
|
||||
report_fatal_error("Not Volatile AMX Model!");
|
||||
PosEnd = I;
|
||||
break;
|
||||
} else {
|
||||
assert(!KeyAMX && "Too many key amx intrinsic!");
|
||||
KeyAMX = II;
|
||||
}
|
||||
}
|
||||
assert(PosEnd != BB->end() && "Not find TileStore!");
|
||||
|
||||
// See KeyAMX as TileStore if only TileLoad and TileStore.
|
||||
if (!KeyAMX)
|
||||
KeyAMX = dyn_cast<IntrinsicInst>(&*PosEnd);
|
||||
|
||||
// Get Shapes in order.
|
||||
assert(Shapes.empty() && "Shapes should be clean.");
|
||||
getKeyAMXShapes(KeyAMX, Shapes);
|
||||
|
||||
return PosEnd;
|
||||
}
|
||||
|
||||
// Record a key amx area's shapes with its position.
|
||||
// Use the first tileload as its position.
|
||||
// For example:
|
||||
// ...
|
||||
// --------------------------------------------------------------------------
|
||||
// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) <-- pos
|
||||
// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) /
|
||||
// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) shapes:
|
||||
// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n)
|
||||
// call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n)
|
||||
// --------------------------------------------------------------------------
|
||||
bool X86PreAMXConfig::findConfigShapes(
|
||||
DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes) {
|
||||
bool Find = false;
|
||||
for (BasicBlock &BB : F) {
|
||||
for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
|
||||
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
|
||||
if (!II)
|
||||
continue;
|
||||
if (!isAMXIntrinsic(II))
|
||||
continue;
|
||||
assert(onlyTileDef(II) && "Not volatile model for AMX at O0!");
|
||||
|
||||
I = getShapesAndConfigPosEnd(I, PosAndShapes[&*I]);
|
||||
Find = true;
|
||||
}
|
||||
}
|
||||
return Find;
|
||||
}
|
||||
|
||||
// Insert ldtilecfg and preconfig the shapes for each area of key AMX intrinsic.
|
||||
// e.g. (key amx = tdpbssd)
|
||||
// --------------------------------------------------------------------------
|
||||
// %cfgmem = alloca <16 x i32>, align 4 * allocate mem
|
||||
// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init
|
||||
// ...
|
||||
// ... pre-config shape of %t1 *
|
||||
// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 *
|
||||
// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config
|
||||
// ... *
|
||||
// ... pre-config shape of %t2 *
|
||||
// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes
|
||||
// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 *
|
||||
// ... *
|
||||
// ... pre-config shape of %t3 * of
|
||||
// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 *
|
||||
// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 *
|
||||
// ... * tiles
|
||||
// ... pre-config shape of %td *
|
||||
// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 *
|
||||
// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 *
|
||||
//
|
||||
// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * pre-config
|
||||
// --------------------------------------------------------------------------
|
||||
// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
|
||||
// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
|
||||
// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
|
||||
// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
|
||||
// call void @llvm.x86.tilestored64.internal(... td) area
|
||||
// --------------------------------------------------------------------------
|
||||
bool X86PreAMXConfig::preTileConfig() {
|
||||
DenseMap<Instruction *, SmallVector<Value *, 8>> PosAndShapes;
|
||||
bool NeedCfg = findConfigShapes(PosAndShapes);
|
||||
if (!NeedCfg)
|
||||
return false;
|
||||
for (auto &IPAndShapes : PosAndShapes)
|
||||
addTileConfig(IPAndShapes.first, IPAndShapes.second);
|
||||
|
||||
return true;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
namespace {
|
||||
|
||||
class X86PreAMXConfigPass : public FunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
X86PreAMXConfigPass() : FunctionPass(ID) {
|
||||
initializeX86PreAMXConfigPassPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
bool runOnFunction(Function &F) override {
|
||||
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
|
||||
bool C = false;
|
||||
|
||||
// Prepare for fast register allocation at O0.
|
||||
if (TM->getOptLevel() == CodeGenOpt::None) {
|
||||
|
||||
// We pre-config each key AMX intrinsic at O0.
|
||||
// In theory, one tile config can cover several AMX intrinsics, but
|
||||
// it is very difficult to classify the tile shapes at O0. So here we
|
||||
// keep things simple and pre-config every key AMX intrinsic.
|
||||
X86PreAMXConfig PCFG(F);
|
||||
C = PCFG.preTileConfig();
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
AU.addRequired<TargetPassConfig>();
|
||||
}
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
static const char PassName[] = "Pre AMX Tile Config";
|
||||
char X86PreAMXConfigPass::ID = 0;
|
||||
INITIALIZE_PASS_BEGIN(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
|
||||
INITIALIZE_PASS_END(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
|
||||
|
||||
FunctionPass *llvm::createX86PreAMXConfigPass() {
|
||||
return new X86PreAMXConfigPass();
|
||||
}
|
@@ -64,7 +64,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeX86LowerAMXIntrinsicsLegacyPassPass(PR);
initializeX86LowerAMXTypeLegacyPassPass(PR);
initializeX86PreAMXConfigPassPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@@ -75,7 +74,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86TileConfigPass(PR);
initializeX86FastTileConfigPass(PR);
initializeX86LowerTileCopyPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
@@ -379,7 +377,6 @@ public:
bool addPreISel() override;
void addMachineSSAOptimization() override;
void addPreRegAlloc() override;
bool addPostFastRegAllocRewrite() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
void addPreEmitPass2() override;
@@ -419,9 +416,6 @@ void X86PassConfig::addIRPasses() {
addPass(createX86LowerAMXIntrinsicsPass());
addPass(createX86LowerAMXTypePass());

if (TM->getOptLevel() == CodeGenOpt::None)
addPass(createX86PreAMXConfigPass());

TargetPassConfig::addIRPasses();

if (TM->getOptLevel() != CodeGenOpt::None) {
@@ -589,11 +583,6 @@ void X86PassConfig::addPreEmitPass2() {
addPass(createX86LoadValueInjectionRetHardeningPass());
}

bool X86PassConfig::addPostFastRegAllocRewrite() {
addPass(createX86FastTileConfigPass());
return true;
}

bool X86PassConfig::addPreRewrite() {
addPass(createX86TileConfigPass());
return true;
File diff suppressed because it is too large
@ -1,78 +0,0 @@
; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -lower-amx-type -S | FileCheck %s

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

; Function Attrs: nounwind uwtable
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {

; CHECK-LABEL: entry:
; CHECK: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
; CHECK: if.then:
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: br label %if.end
; CHECK: if.else:
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: br label %if.end
; CHECK: if.end:
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}})
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}})
; CHECK-NEXT: ret void

entry:
  %tobool.not = icmp eq i32 %cond, 0
  br i1 %tobool.not, label %if.else, label %if.then

if.then:                                          ; preds = %entry
  %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  br label %if.end

if.else:                                          ; preds = %entry
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
  %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
  %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
  tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  ret void
}

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)

; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
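The deleted test above exercises only the IR-level -lower-amx-type pass: tile values that cross basic-block boundaries are no longer carried as x86_amx phis but are spilled to 1024-byte, 1024-aligned stack slots with tilestored64 and reloaded with tileloadd64 at their uses. The fragment below is a minimal sketch of that store/reload pattern, not part of the reverted patch; the function name, value names, and the fixed stride of 64 are illustrative assumptions, while the intrinsic signatures are the ones declared in the test.

; Sketch only (not from the deleted test): one tile value spilled to a
; 1024-byte slot and reloaded, the pattern -lower-amx-type produces for
; values that live across blocks.
define void @spill_reload_sketch(i16 %row, i16 %col, i8* %src) {
entry:
  %slot = alloca <256 x i32>, align 1024
  %p = bitcast <256 x i32>* %slot to i8*
  %t = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %src, i64 64)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p, i64 64, x86_amx %t)
  %t2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %p, i64 64)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %src, i64 64, x86_amx %t2)
  ret void
}
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)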
@ -1,210 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -pre-amx-config -S | FileCheck %s

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

; Function Attrs: nounwind uwtable
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
; CHECK-LABEL: entry:
|
||||
; CHECK: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
|
||||
; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
|
||||
|
||||
; CHECK: if.then:
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: br label %if.end
|
||||
|
||||
; CHECK: if.else:
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: br label %if.end
|
||||
; CHECK: if.end: ; preds = %if.else, %if.then
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: %amx.tmm.1.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 49
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 18
|
||||
; CHECK-NEXT: %amx.tmm.1.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.1.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 8, i16* %amx.tmm.1.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: %amx.tmm.2.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 50
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 20
|
||||
; CHECK-NEXT: %amx.tmm.2.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.2.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.2.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: %amx.tmm.3.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 51
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 22
|
||||
; CHECK-NEXT: %amx.tmm.3.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.3.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.3.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64)
|
||||
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64)
|
||||
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
|
||||
; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
|
||||
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
|
||||
; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
|
||||
; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
|
||||
; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
|
||||
; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
|
||||
; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
|
||||
; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
|
||||
; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
|
||||
; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
|
||||
; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}})
|
||||
; CHECK-NEXT: ret void
|
||||
|
||||
entry:
|
||||
%0 = alloca <256 x i32>, align 1024
|
||||
%1 = bitcast <256 x i32>* %0 to i8*
|
||||
%2 = alloca <256 x i32>, align 1024
|
||||
%3 = bitcast <256 x i32>* %2 to i8*
|
||||
%4 = alloca <256 x i32>, align 1024
|
||||
%5 = bitcast <256 x i32>* %4 to i8*
|
||||
%6 = alloca <256 x i32>, align 1024
|
||||
%7 = bitcast <256 x i32>* %6 to i8*
|
||||
%tobool.not = icmp eq i32 %cond, 0
|
||||
br i1 %tobool.not, label %if.else, label %if.then
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %8)
|
||||
%9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %9)
|
||||
%10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %10)
|
||||
br label %if.end
|
||||
|
||||
if.else: ; preds = %entry
|
||||
%11 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %11)
|
||||
%12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %12)
|
||||
%13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %13)
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.else, %if.then
|
||||
%14 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %5, i64 64)
|
||||
%15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %3, i64 64)
|
||||
%16 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %1, i64 64)
|
||||
%17 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %16, x86_amx %14, x86_amx %15)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %7, i64 64, x86_amx %17)
|
||||
%18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %7, i64 64)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %18)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
|
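The -pre-amx-config output checked above follows the fixed layout of the 64-byte tile-configuration block: byte 0 holds the palette (the stores of i8 1), the i16 per-tile column values start at byte 16 (offsets 16, 18, 20, 22 for tmm0..tmm3 in the if.end block), and the i8 per-tile row counts start at byte 48 (offsets 48..51), after which @llvm.x86.ldtilecfg is executed on the block. Below is a minimal sketch of that sequence for a single tile; the function name and shapes are illustrative, and the intrinsic signature is the one declared in the test above.

; Sketch only: configure one tile (tmm0) the way the deleted pass did before
; each AMX intrinsic at O0: zero a 64-byte block, set palette 1, write the
; row count (i8, offset 48) and column value (i16, offset 16), then ldtilecfg.
define void @config_sketch(i16 %row, i16 %col) {
entry:
  %cfg = alloca <16 x i32>, align 4
  %p = bitcast <16 x i32>* %cfg to i8*
  store <16 x i32> zeroinitializer, <16 x i32>* %cfg, align 4
  store i8 1, i8* %p, align 1
  %rowp = getelementptr i8, i8* %p, i64 48
  %r = trunc i16 %row to i8
  store i8 %r, i8* %rowp, align 1
  %colp8 = getelementptr i8, i8* %p, i64 16
  %colp = bitcast i8* %colp8 to i16*
  store i16 %col, i16* %colp, align 2
  call void @llvm.x86.ldtilecfg(i8* %p)
  ret void
}
declare void @llvm.x86.ldtilecfg(i8*)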
@ -1,513 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

; Function Attrs: nounwind uwtable
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
; AVX512-LABEL: test_api:
|
||||
; AVX512: # %bb.0: # %entry
|
||||
; AVX512-NEXT: pushq %rbp
|
||||
; AVX512-NEXT: .cfi_def_cfa_offset 16
|
||||
; AVX512-NEXT: .cfi_offset %rbp, -16
|
||||
; AVX512-NEXT: movq %rsp, %rbp
|
||||
; AVX512-NEXT: .cfi_def_cfa_register %rbp
|
||||
; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
|
||||
; AVX512-NEXT: subq $6144, %rsp # imm = 0x1800
|
||||
; AVX512-NEXT: movw %dx, %ax
|
||||
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
|
||||
; AVX512-NEXT: movw %si, %ax
|
||||
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
|
||||
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX512-NEXT: cmpl $0, %edi
|
||||
; AVX512-NEXT: je .LBB0_2
|
||||
; AVX512-NEXT: # %bb.1: # %if.then
|
||||
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
|
||||
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %al, %sil
|
||||
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movl $buf, %r9d
|
||||
; AVX512-NEXT: movl $32, %r10d
|
||||
; AVX512-NEXT: movw $8, %si
|
||||
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX512-NEXT: movl $64, %r8d
|
||||
; AVX512-NEXT: tilestored %tmm0, (%r11,%r8)
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
|
||||
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %al, %dil
|
||||
; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg (%rsi)
|
||||
; AVX512-NEXT: movl $buf, %esi
|
||||
; AVX512-NEXT: movl $32, %edi
|
||||
; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; AVX512-NEXT: movl $64, %esi
|
||||
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX512-NEXT: jmp .LBB0_3
|
||||
; AVX512-NEXT: .LBB0_2: # %if.else
|
||||
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
|
||||
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %al, %sil
|
||||
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movl $buf2, %r9d
|
||||
; AVX512-NEXT: movl $32, %r10d
|
||||
; AVX512-NEXT: movw $8, %si
|
||||
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX512-NEXT: movl $64, %r8d
|
||||
; AVX512-NEXT: tilestored %tmm0, (%r11,%r8)
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
|
||||
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %al, %dil
|
||||
; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg (%rsi)
|
||||
; AVX512-NEXT: movl $buf2, %esi
|
||||
; AVX512-NEXT: movl $32, %edi
|
||||
; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; AVX512-NEXT: movl $64, %esi
|
||||
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX512-NEXT: .LBB0_3: # %if.end
|
||||
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
|
||||
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
|
||||
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %al, %sil
|
||||
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movl $64, %esi
|
||||
; AVX512-NEXT: movw $8, %di
|
||||
; AVX512-NEXT: tileloadd (%r10,%rsi), %tmm1
|
||||
; AVX512-NEXT: tileloadd (%r9,%rsi), %tmm2
|
||||
; AVX512-NEXT: tileloadd (%r8,%rsi), %tmm0
|
||||
; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
|
||||
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movb %al, %dil
|
||||
; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX512-NEXT: ldtilecfg (%rsi)
|
||||
; AVX512-NEXT: movl $64, %esi
|
||||
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
|
||||
; AVX512-NEXT: movl $buf, %edx
|
||||
; AVX512-NEXT: movl $32, %esi
|
||||
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX512-NEXT: movq %rbp, %rsp
|
||||
; AVX512-NEXT: popq %rbp
|
||||
; AVX512-NEXT: .cfi_def_cfa %rsp, 8
|
||||
; AVX512-NEXT: tilerelease
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_api:
|
||||
; AVX2: # %bb.0: # %entry
|
||||
; AVX2-NEXT: pushq %rbp
|
||||
; AVX2-NEXT: .cfi_def_cfa_offset 16
|
||||
; AVX2-NEXT: .cfi_offset %rbp, -16
|
||||
; AVX2-NEXT: movq %rsp, %rbp
|
||||
; AVX2-NEXT: .cfi_def_cfa_register %rbp
|
||||
; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
|
||||
; AVX2-NEXT: subq $6144, %rsp # imm = 0x1800
|
||||
; AVX2-NEXT: movw %dx, %ax
|
||||
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
|
||||
; AVX2-NEXT: movw %si, %ax
|
||||
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
|
||||
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; AVX2-NEXT: cmpl $0, %edi
|
||||
; AVX2-NEXT: je .LBB0_2
|
||||
; AVX2-NEXT: # %bb.1: # %if.then
|
||||
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
|
||||
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %al, %sil
|
||||
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movl $buf, %r9d
|
||||
; AVX2-NEXT: movl $32, %r10d
|
||||
; AVX2-NEXT: movw $8, %si
|
||||
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX2-NEXT: movl $64, %r8d
|
||||
; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
|
||||
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %al, %dil
|
||||
; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg (%rsi)
|
||||
; AVX2-NEXT: movl $buf, %esi
|
||||
; AVX2-NEXT: movl $32, %edi
|
||||
; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; AVX2-NEXT: movl $64, %esi
|
||||
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX2-NEXT: jmp .LBB0_3
|
||||
; AVX2-NEXT: .LBB0_2: # %if.else
|
||||
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
|
||||
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %al, %sil
|
||||
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movl $buf2, %r9d
|
||||
; AVX2-NEXT: movl $32, %r10d
|
||||
; AVX2-NEXT: movw $8, %si
|
||||
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX2-NEXT: movl $64, %r8d
|
||||
; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
|
||||
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %al, %dil
|
||||
; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg (%rsi)
|
||||
; AVX2-NEXT: movl $buf2, %esi
|
||||
; AVX2-NEXT: movl $32, %edi
|
||||
; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; AVX2-NEXT: movl $64, %esi
|
||||
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX2-NEXT: .LBB0_3: # %if.end
|
||||
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
|
||||
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
|
||||
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %al, %sil
|
||||
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movl $64, %esi
|
||||
; AVX2-NEXT: movw $8, %di
|
||||
; AVX2-NEXT: tileloadd (%r10,%rsi), %tmm1
|
||||
; AVX2-NEXT: tileloadd (%r9,%rsi), %tmm2
|
||||
; AVX2-NEXT: tileloadd (%r8,%rsi), %tmm0
|
||||
; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
|
||||
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movb %al, %dil
|
||||
; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; AVX2-NEXT: ldtilecfg (%rsi)
|
||||
; AVX2-NEXT: movl $64, %esi
|
||||
; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
|
||||
; AVX2-NEXT: movl $buf, %edx
|
||||
; AVX2-NEXT: movl $32, %esi
|
||||
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; AVX2-NEXT: movq %rbp, %rsp
|
||||
; AVX2-NEXT: popq %rbp
|
||||
; AVX2-NEXT: .cfi_def_cfa %rsp, 8
|
||||
; AVX2-NEXT: tilerelease
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; SSE2-LABEL: test_api:
|
||||
; SSE2: # %bb.0: # %entry
|
||||
; SSE2-NEXT: pushq %rbp
|
||||
; SSE2-NEXT: .cfi_def_cfa_offset 16
|
||||
; SSE2-NEXT: .cfi_offset %rbp, -16
|
||||
; SSE2-NEXT: movq %rsp, %rbp
|
||||
; SSE2-NEXT: .cfi_def_cfa_register %rbp
|
||||
; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00
|
||||
; SSE2-NEXT: subq $6144, %rsp # imm = 0x1800
|
||||
; SSE2-NEXT: movw %dx, %ax
|
||||
; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
|
||||
; SSE2-NEXT: movw %si, %ax
|
||||
; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
|
||||
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
|
||||
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; SSE2-NEXT: cmpl $0, %edi
|
||||
; SSE2-NEXT: je .LBB0_2
|
||||
; SSE2-NEXT: # %bb.1: # %if.then
|
||||
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
|
||||
; SSE2-NEXT: xorps %xmm0, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %al, %sil
|
||||
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movl $buf, %r9d
|
||||
; SSE2-NEXT: movl $32, %r10d
|
||||
; SSE2-NEXT: movw $8, %si
|
||||
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; SSE2-NEXT: movl $64, %r8d
|
||||
; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
|
||||
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %al, %dil
|
||||
; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg (%rsi)
|
||||
; SSE2-NEXT: movl $buf, %esi
|
||||
; SSE2-NEXT: movl $32, %edi
|
||||
; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; SSE2-NEXT: movl $64, %esi
|
||||
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; SSE2-NEXT: jmp .LBB0_3
|
||||
; SSE2-NEXT: .LBB0_2: # %if.else
|
||||
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
|
||||
; SSE2-NEXT: xorps %xmm0, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %al, %sil
|
||||
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movl $buf2, %r9d
|
||||
; SSE2-NEXT: movl $32, %r10d
|
||||
; SSE2-NEXT: movw $8, %si
|
||||
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; SSE2-NEXT: movl $64, %r8d
|
||||
; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
|
||||
; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
|
||||
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %al, %dil
|
||||
; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg (%rsi)
|
||||
; SSE2-NEXT: movl $buf2, %esi
|
||||
; SSE2-NEXT: movl $32, %edi
|
||||
; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; SSE2-NEXT: movl $64, %esi
|
||||
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; SSE2-NEXT: .LBB0_3: # %if.end
|
||||
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
|
||||
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
|
||||
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
|
||||
; SSE2-NEXT: xorps %xmm0, %xmm0
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %al, %sil
|
||||
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movl $64, %esi
|
||||
; SSE2-NEXT: movw $8, %di
|
||||
; SSE2-NEXT: tileloadd (%r10,%rsi), %tmm1
|
||||
; SSE2-NEXT: tileloadd (%r9,%rsi), %tmm2
|
||||
; SSE2-NEXT: tileloadd (%r8,%rsi), %tmm0
|
||||
; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
|
||||
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movb %al, %dil
|
||||
; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; SSE2-NEXT: ldtilecfg (%rsi)
|
||||
; SSE2-NEXT: movl $64, %esi
|
||||
; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
|
||||
; SSE2-NEXT: movl $buf, %edx
|
||||
; SSE2-NEXT: movl $32, %esi
|
||||
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; SSE2-NEXT: movq %rbp, %rsp
|
||||
; SSE2-NEXT: popq %rbp
|
||||
; SSE2-NEXT: .cfi_def_cfa %rsp, 8
|
||||
; SSE2-NEXT: tilerelease
|
||||
; SSE2-NEXT: retq
|
||||
entry:
|
||||
%tobool.not = icmp eq i32 %cond, 0
|
||||
br i1 %tobool.not, label %if.else, label %if.then
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
%1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
%2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
br label %if.end
|
||||
|
||||
if.else: ; preds = %entry
|
||||
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.else, %if.then
|
||||
%a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
|
||||
%b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
|
||||
%c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
|
||||
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
|
@ -1,465 +0,0 @@
# RUN: llc -o - -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -run-pass=fasttileconfig %s | FileCheck %s

--- |

  @buf = dso_local global [1024 x i8] zeroinitializer, align 16
  @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

  define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr #0 {
entry:
|
||||
%0 = alloca <16 x i32>, align 4
|
||||
%1 = alloca <16 x i32>, align 4
|
||||
%2 = alloca <16 x i32>, align 4
|
||||
%3 = alloca <16 x i32>, align 4
|
||||
%4 = alloca <16 x i32>, align 4
|
||||
%5 = alloca <16 x i32>, align 4
|
||||
%6 = alloca <16 x i32>, align 4
|
||||
%7 = alloca <16 x i32>, align 4
|
||||
%8 = alloca <256 x i32>, align 1024
|
||||
%9 = bitcast <256 x i32>* %8 to i8*
|
||||
%10 = alloca <256 x i32>, align 1024
|
||||
%11 = bitcast <256 x i32>* %10 to i8*
|
||||
%12 = alloca <256 x i32>, align 1024
|
||||
%13 = bitcast <256 x i32>* %12 to i8*
|
||||
%14 = alloca <256 x i32>, align 1024
|
||||
%15 = bitcast <256 x i32>* %14 to i8*
|
||||
%tobool.not = icmp eq i32 %cond, 0
|
||||
br i1 %tobool.not, label %if.else, label %if.then
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%16 = bitcast <16 x i32>* %6 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %6, align 64
|
||||
%amx.tmm.0.shape.row1 = getelementptr i8, i8* %16, i64 48
|
||||
%17 = getelementptr i8, i8* %16, i64 16
|
||||
%amx.tmm.0.shape.col2 = bitcast i8* %17 to i16*
|
||||
%18 = trunc i16 %row to i8
|
||||
store volatile i8 %18, i8* %amx.tmm.0.shape.row1, align 1
|
||||
store volatile i16 8, i16* %amx.tmm.0.shape.col2, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %16)
|
||||
%19 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %19)
|
||||
%20 = bitcast <16 x i32>* %2 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %2, align 64
|
||||
%amx.tmm.0.shape.row9 = getelementptr i8, i8* %20, i64 48
|
||||
%21 = getelementptr i8, i8* %20, i64 16
|
||||
%amx.tmm.0.shape.col10 = bitcast i8* %21 to i16*
|
||||
%22 = trunc i16 8 to i8
|
||||
store volatile i8 %22, i8* %amx.tmm.0.shape.row9, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.0.shape.col10, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %20)
|
||||
%23 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %23)
|
||||
%24 = bitcast <16 x i32>* %3 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %3, align 64
|
||||
%amx.tmm.0.shape.row7 = getelementptr i8, i8* %24, i64 48
|
||||
%25 = getelementptr i8, i8* %24, i64 16
|
||||
%amx.tmm.0.shape.col8 = bitcast i8* %25 to i16*
|
||||
%26 = trunc i16 %row to i8
|
||||
store volatile i8 %26, i8* %amx.tmm.0.shape.row7, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.0.shape.col8, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %24)
|
||||
%27 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %27)
|
||||
br label %if.end
|
||||
|
||||
if.else: ; preds = %entry
|
||||
%28 = bitcast <16 x i32>* %1 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %1, align 64
|
||||
%amx.tmm.0.shape.row11 = getelementptr i8, i8* %28, i64 48
|
||||
%29 = getelementptr i8, i8* %28, i64 16
|
||||
%amx.tmm.0.shape.col12 = bitcast i8* %29 to i16*
|
||||
%30 = trunc i16 %row to i8
|
||||
store volatile i8 %30, i8* %amx.tmm.0.shape.row11, align 1
|
||||
store volatile i16 8, i16* %amx.tmm.0.shape.col12, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %28)
|
||||
%31 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %31)
|
||||
%32 = bitcast <16 x i32>* %7 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %7, align 64
|
||||
%amx.tmm.0.shape.row = getelementptr i8, i8* %32, i64 48
|
||||
%33 = getelementptr i8, i8* %32, i64 16
|
||||
%amx.tmm.0.shape.col = bitcast i8* %33 to i16*
|
||||
%34 = trunc i16 8 to i8
|
||||
store volatile i8 %34, i8* %amx.tmm.0.shape.row, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.0.shape.col, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %32)
|
||||
%35 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %35)
|
||||
%36 = bitcast <16 x i32>* %0 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %0, align 64
|
||||
%amx.tmm.0.shape.row13 = getelementptr i8, i8* %36, i64 48
|
||||
%37 = getelementptr i8, i8* %36, i64 16
|
||||
%amx.tmm.0.shape.col14 = bitcast i8* %37 to i16*
|
||||
%38 = trunc i16 %row to i8
|
||||
store volatile i8 %38, i8* %amx.tmm.0.shape.row13, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.0.shape.col14, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %36)
|
||||
%39 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %39)
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.else, %if.then
|
||||
%40 = bitcast <16 x i32>* %4 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %4, align 64
|
||||
%amx.tmm.0.shape.row5 = getelementptr i8, i8* %40, i64 48
|
||||
%41 = getelementptr i8, i8* %40, i64 16
|
||||
%amx.tmm.0.shape.col6 = bitcast i8* %41 to i16*
|
||||
%42 = trunc i16 %row to i8
|
||||
store volatile i8 %42, i8* %amx.tmm.0.shape.row5, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.0.shape.col6, align 2
|
||||
%amx.tmm.1.shape.row = getelementptr i8, i8* %40, i64 49
|
||||
%43 = getelementptr i8, i8* %40, i64 18
|
||||
%amx.tmm.1.shape.col = bitcast i8* %43 to i16*
|
||||
%44 = trunc i16 %row to i8
|
||||
store volatile i8 %44, i8* %amx.tmm.1.shape.row, align 1
|
||||
store volatile i16 8, i16* %amx.tmm.1.shape.col, align 2
|
||||
%amx.tmm.2.shape.row = getelementptr i8, i8* %40, i64 50
|
||||
%45 = getelementptr i8, i8* %40, i64 20
|
||||
%amx.tmm.2.shape.col = bitcast i8* %45 to i16*
|
||||
%46 = trunc i16 8 to i8
|
||||
store volatile i8 %46, i8* %amx.tmm.2.shape.row, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.2.shape.col, align 2
|
||||
%amx.tmm.3.shape.row = getelementptr i8, i8* %40, i64 51
|
||||
%47 = getelementptr i8, i8* %40, i64 22
|
||||
%amx.tmm.3.shape.col = bitcast i8* %47 to i16*
|
||||
%48 = trunc i16 %row to i8
|
||||
store volatile i8 %48, i8* %amx.tmm.3.shape.row, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.3.shape.col, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %40)
|
||||
%49 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %13, i64 64)
|
||||
%50 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %11, i64 64)
|
||||
%51 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %9, i64 64)
|
||||
%52 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %51, x86_amx %49, x86_amx %50)
|
||||
call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %15, i64 64, x86_amx %52)
|
||||
%53 = bitcast <16 x i32>* %5 to i8*
|
||||
store <16 x i32> zeroinitializer, <16 x i32>* %5, align 64
|
||||
%amx.tmm.0.shape.row3 = getelementptr i8, i8* %53, i64 48
|
||||
%54 = getelementptr i8, i8* %53, i64 16
|
||||
%amx.tmm.0.shape.col4 = bitcast i8* %54 to i16*
|
||||
%55 = trunc i16 %row to i8
|
||||
store volatile i8 %55, i8* %amx.tmm.0.shape.row3, align 1
|
||||
store volatile i16 %col, i16* %amx.tmm.0.shape.col4, align 2
|
||||
call void @llvm.x86.ldtilecfg(i8* %53)
|
||||
%56 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %15, i64 64)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %56)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #1
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #1
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #1
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare void @llvm.x86.ldtilecfg(i8*) #2
|
||||
|
||||
attributes #0 = { "target-features"="+amx-int8,+avx512f" }
|
||||
attributes #1 = { nounwind "target-features"="+amx-int8,+avx512f" }
|
||||
attributes #2 = { nounwind }
|
||||
|
||||
...
|
||||
---
|
||||
name: test_api
|
||||
alignment: 16
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
failedISel: false
|
||||
tracksRegLiveness: true
|
||||
hasWinCFI: false
|
||||
registers: []
|
||||
liveins:
|
||||
- { reg: '$edi', virtual-reg: '' }
|
||||
- { reg: '$esi', virtual-reg: '' }
|
||||
- { reg: '$edx', virtual-reg: '' }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 1024
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
stackProtector: ''
|
||||
maxCallFrameSize: 4294967295
|
||||
cvBytesOfCalleeSavedRegisters: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
hasTailCall: false
|
||||
localFrameSize: 0
|
||||
savePoint: ''
|
||||
restorePoint: ''
|
||||
fixedStack: []
|
||||
stack:
- { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 3, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 5, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 6, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 7, name: '', type: default, offset: 0, size: 64, alignment: 16,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 8, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 9, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 10, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 11, name: '', type: default, offset: 0, size: 1024, alignment: 1024,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 12, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 13, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 14, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 15, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 16, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 17, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo: {}
body: |
bb.0.entry:
successors: %bb.2(0x40000000), %bb.1(0x40000000)
liveins: $edi, $esi, $edx

renamable $ax = COPY renamable $dx, implicit killed $edx
MOV16mr %stack.17, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.17)
renamable $ax = COPY renamable $si, implicit killed $esi
MOV16mr %stack.16, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.16)
renamable $rax = LEA64r %stack.8, 1, $noreg, 0, $noreg
MOV64mr %stack.15, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.15)
renamable $rax = LEA64r %stack.9, 1, $noreg, 0, $noreg
MOV64mr %stack.14, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.14)
renamable $rax = LEA64r %stack.10, 1, $noreg, 0, $noreg
MOV64mr %stack.13, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.13)
renamable $rax = LEA64r %stack.11, 1, $noreg, 0, $noreg
MOV64mr %stack.12, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.12)
CMP32ri8 killed renamable $edi, 0, implicit-def $eflags
JCC_1 %bb.2, 4, implicit killed $eflags

bb.1.if.then:
successors: %bb.3(0x80000000)
; CHECK-LABEL: bb.1.if.then
; tmm0 --> row_offset = 48, col_offset = 16
; CHECK: MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1)
; CHECK: MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2)
; CHECK: LDTILECFG %stack.6, 1, $noreg, 0, $noreg
; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0

; tmm1 --> row_offset = 49, col_offset = 18
; CHECK: MOV8mi %stack.2, 1, $noreg, 49, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9)
; CHECK: MOV16mr %stack.2, 1, $noreg, 18, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10)
; CHECK: LDTILECFG %stack.2, 1, $noreg, 0, $noreg
; CHECK: renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1

; tmm2 --> row_offset = 50, col_offset = 20
; CHECK: MOV8mr %stack.3, 1, $noreg, 50, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7)
; CHECK: MOV16mr %stack.3, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8)
; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg
; CHECK: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2

$ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
$cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
$rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
$rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
$r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
renamable $zmm0 = AVX512_512_SET0
VMOVDQA64Zmr %stack.6, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.6)
renamable $sil = COPY renamable $al
MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1)
MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2)
LDTILECFG %stack.6, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $r9 = MOV32ri64 @buf
renamable $r10 = MOV32ri64 32
renamable $si = MOV16ri 8
renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
renamable $r8 = MOV32ri64 64
PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
VMOVDQA64Zmr %stack.2, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.2)
MOV8mi %stack.2, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9)
MOV16mr %stack.2, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10)
LDTILECFG %stack.2, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1
renamable $rsi = LEA64r %stack.3, 1, $noreg, 0, $noreg
VMOVDQA64Zmr %stack.3, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.3)
renamable $dil = COPY renamable $al
MOV8mr %stack.3, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7)
MOV16mr %stack.3, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8)
LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $rsi = MOV32ri64 @buf
renamable $rdi = MOV32ri64 32
renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
renamable $rsi = MOV32ri64 64
PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2
JMP_1 %bb.3

bb.2.if.else:
successors: %bb.3(0x80000000)

; CHECK-LABEL: bb.2.if.else
; tmm3 --> row_offset = 51, col_offset = 22
; CHECK: MOV8mr %stack.1, 1, $noreg, 51, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11)
; CHECK: MOV16mi %stack.1, 1, $noreg, 22, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12)
; CHECK: LDTILECFG %stack.1, 1, $noreg, 0, $noreg
; CHECK: renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3

; tmm4 --> row_offset = 52, col_offset = 24
; CHECK: MOV8mi %stack.7, 1, $noreg, 52, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row)
; CHECK: MOV16mr %stack.7, 1, $noreg, 24, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col)
; CHECK: LDTILECFG %stack.7, 1, $noreg, 0, $noreg
; CHECK: renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4

; tmm4 --> row_offset = 53, col_offset = 26
; CHECK: MOV8mr %stack.0, 1, $noreg, 53, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13)
; CHECK: MOV16mr %stack.0, 1, $noreg, 26, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14)
; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg
; CHECK: renamable $tmm5 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5

$ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
$cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
$rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
$rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
$r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
renamable $zmm0 = AVX512_512_SET0
VMOVDQA64Zmr %stack.1, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.1)
renamable $sil = COPY renamable $al
MOV8mr %stack.1, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11)
MOV16mi %stack.1, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12)
LDTILECFG %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $r9 = MOV32ri64 @buf2
renamable $r10 = MOV32ri64 32
renamable $si = MOV16ri 8
renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
renamable $r8 = MOV32ri64 64
PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3
VMOVDQA64Zmr %stack.7, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.7)
MOV8mi %stack.7, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row)
MOV16mr %stack.7, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col)
LDTILECFG %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4
renamable $rsi = LEA64r %stack.0, 1, $noreg, 0, $noreg
VMOVDQA64Zmr %stack.0, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.0)
renamable $dil = COPY renamable $al
MOV8mr %stack.0, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13)
MOV16mr %stack.0, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14)
LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $rsi = MOV32ri64 @buf2
renamable $rdi = MOV32ri64 32
renamable $tmm5 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
renamable $rsi = MOV32ri64 64
PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5

bb.3.if.end:
; CHECK-LABEL: bb.3.if.end
; tmm0 --> row_offset = 48, col_offset = 16
; tmm1 --> row_offset = 49, col_offset = 18
; tmm2 --> row_offset = 50, col_offset = 20
; CHECK: MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5)
; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6)
; CHECK: MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row)
; CHECK: MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col)
; CHECK: MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row)
; CHECK: MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col)
; CHECK: MOV8mr %stack.4, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row)
; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col)
; CHECK: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0
; CHECK: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg
; CHECK: renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg
; CHECK: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0

; tmm6 --> row_offset = 54, col_offset = 28
; CHECK: MOV8mr %stack.5, 1, $noreg, 54, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3)
; CHECK: MOV16mr %stack.5, 1, $noreg, 28, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4)
; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg
; CHECK: renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
; CHECK: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6

$ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
$cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
$rdx = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load 8 from %stack.12)
$r8 = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
$r9 = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
$r10 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
renamable $zmm0 = AVX512_512_SET0
VMOVDQA64Zmr %stack.4, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.4)
renamable $sil = COPY renamable $al
MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5)
MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6)
MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row)
MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col)
MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row)
MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col)
MOV8mr %stack.4, 1, $noreg, 51, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row)
MOV16mr %stack.4, 1, $noreg, 22, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col)
LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $rsi = MOV32ri64 64
renamable $di = MOV16ri 8
renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg
renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg
renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
renamable $rsi = LEA64r %stack.5, 1, $noreg, 0, $noreg
VMOVDQA64Zmr %stack.5, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.5)
renamable $dil = COPY renamable $al
MOV8mr %stack.5, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3)
MOV16mr %stack.5, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4)
LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
renamable $rsi = MOV32ri64 64
renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
renamable $rdx = MOV32ri64 @buf
renamable $rsi = MOV32ri64 32
PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6
RETQ

...
@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s

define dso_local void @test_no_bitcast(i32* %A_mem, i32* %B_mem, i32* %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s

define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
; CHECK-LABEL: @test_amx_load_non_O0(
@ -20,7 +20,6 @@
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
; CHECK-NEXT: Pre AMX Tile Config
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
@ -46,7 +45,6 @@
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
; CHECK-NEXT: Fast Register Allocator
; CHECK-NEXT: Fast Tile Register Configure
; CHECK-NEXT: X86 Lower Tile Copy
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
@ -509,29 +509,19 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
"mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-",
"si-", "gcn-", "amdgpu-", "aarch64-", "amdgcn-", "polly-"};
std::vector<StringRef> PassNameContain = {"ehprepare"};
std::vector<StringRef> PassNameExact = {"safe-stack",
"cost-model",
"codegenprepare",
"interleaved-load-combine",
"unreachableblockelim",
"verify-safepoint-ir",
"atomic-expand",
"hardware-loops",
"type-promotion",
"mve-tail-predication",
"interleaved-access",
"global-merge",
"pre-isel-intrinsic-lowering",
"expand-reductions",
"indirectbr-expand",
"generic-to-nvvm",
"expandmemcmp",
"loop-reduce",
"lower-amx-type",
"pre-amx-config",
"lower-amx-intrinsics",
"polyhedral-info",
"replace-with-veclib"};
std::vector<StringRef> PassNameExact = {
"safe-stack", "cost-model",
"codegenprepare", "interleaved-load-combine",
"unreachableblockelim", "verify-safepoint-ir",
"atomic-expand",
"hardware-loops", "type-promotion",
"mve-tail-predication", "interleaved-access",
"global-merge", "pre-isel-intrinsic-lowering",
"expand-reductions", "indirectbr-expand",
"generic-to-nvvm", "expandmemcmp",
"loop-reduce", "lower-amx-type",
"lower-amx-intrinsics", "polyhedral-info",
"replace-with-veclib"};
for (const auto &P : PassNamePrefix)
if (Pass.startswith(P))
return true;
@ -87,7 +87,6 @@ static_library("LLVMX86CodeGen") {
"X86EvexToVex.cpp",
"X86ExpandPseudo.cpp",
"X86FastISel.cpp",
"X86FastTileConfig.cpp",
"X86FixupBWInsts.cpp",
"X86FixupLEAs.cpp",
"X86FixupSetCC.cpp",