
[X86][AMX] Lower tile copy instruction.

Since there is no tile copy instruction, we store the tile register
to the stack and load it from the stack into another tile register.
We need an extra GR to hold the stride, and a stack slot to hold the
tile data. We run this pass after copy propagation, so that we don't
miss any copy optimization, and before prolog/epilog insertion, so
that we can allocate the stack slot.
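
For a copy between two tile registers the pass emits, roughly, the
sequence below (a sketch in AT&T syntax; the stack-slot offsets are
placeholders that prolog/epilog insertion resolves later):

    # %tmm1 = COPY %tmm0 becomes:
    movq %rax, <stride-slot>(%rsp)            # save the scratch GR
    movq $64, %rax                            # materialize the stride
    tilestored %tmm0, <tile-slot>(%rsp,%rax)  # store the source tile
    tileloadd <tile-slot>(%rsp,%rax), %tmm1   # reload into the destination
    movq <stride-slot>(%rsp), %rax            # restore the scratch GR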

Differential Revision: https://reviews.llvm.org/D97112
Author: Luo, Yuanke
Date: 2021-02-20 15:05:07 +08:00
parent 771be1b79d
commit ede5e5d465
8 changed files with 330 additions and 0 deletions


@@ -32,6 +32,7 @@ set(sources
X86CmovConversion.cpp
X86DomainReassignment.cpp
X86DiscriminateMemOps.cpp
X86LowerTileCopy.cpp
X86LowerAMXType.cpp
X86TileConfig.cpp
X86PreTileConfig.cpp


@@ -76,10 +76,15 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
/// Return a pass that configures the tile registers.
FunctionPass *createX86TileConfigPass();
/// Return a pass that inserts the pseudo tile config instruction.
FunctionPass *createX86PreTileConfigPass();
/// Return a pass that lowers the tile copy instruction.
FunctionPass *createX86LowerTileCopyPass();
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
@@ -169,6 +174,7 @@ void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
namespace X86AS {
enum : unsigned {


@@ -0,0 +1,132 @@
//===-- X86LowerTileCopy.cpp - Expand Tile Copy Instructions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass that lowers AMX tile copy instructions. Since
// there is no tile copy instruction, we store the tile register to the
// stack and load it from the stack into another tile register. We need an
// extra GR to hold the stride, and a stack slot to hold the tile data. We
// run this pass after copy propagation, so that we don't miss any copy
// optimization, and before prolog/epilog insertion, so that we can allocate
// the stack slot.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "x86-lower-tile-copy"
namespace {
class X86LowerTileCopy : public MachineFunctionPass {
public:
static char ID;
X86LowerTileCopy() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "X86 Lower Tile Copy"; }
};
} // namespace
char X86LowerTileCopy::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
false, false)
INITIALIZE_PASS_END(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
false, false)
void X86LowerTileCopy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
FunctionPass *llvm::createX86LowerTileCopyPass() {
return new X86LowerTileCopy();
}
bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const X86InstrInfo *TII = ST.getInstrInfo();
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
MII != MIE;) {
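// MI may be erased below; advance the iterator first.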
MachineInstr &MI = *MII++;
if (!MI.isCopy())
continue;
MachineOperand &DstMO = MI.getOperand(0);
MachineOperand &SrcMO = MI.getOperand(1);
Register SrcReg = SrcMO.getReg();
Register DstReg = DstMO.getReg();
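// Only a copy between two tile registers needs this lowering.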
if (!X86::TILERegClass.contains(DstReg, SrcReg))
continue;
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
// Allocate stack slot for tile register
unsigned Size = TRI->getSpillSize(X86::TILERegClass);
Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
// Allocate stack slot for stride register
Size = TRI->getSpillSize(X86::GR64RegClass);
Alignment = TRI->getSpillAlign(X86::GR64RegClass);
int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
// TODO: Pick a killed register to avoid the save/reload. There is no easy
// way to get live intervals at this stage.
Register GR64Cand = X86::RAX;
const DebugLoc &DL = MI.getDebugLoc();
// Save the scratch register: mov %rax, (%rsp)
BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
.addReg(GR64Cand);
// Materialize the stride: mov $64, %rax
BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
// tilestored %tmm, (%sp, %idx)
unsigned Opc = X86::TILESTORED;
MachineInstr *NewMI =
addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS)
.addReg(SrcReg, getKillRegState(SrcMO.isKill()));
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(GR64Cand);
MO.setIsKill(true);
// tileloadd (%sp, %idx), %tmm
Opc = X86::TILELOADD;
NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
TileSS);
// Restore the scratch register: mov (%rsp), %rax
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
StrideSS);
MI.eraseFromParent();
Changed = true;
}
}
return Changed;
}


@@ -875,6 +875,12 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
default:
llvm_unreachable("Unexpected machine instruction on tile register!");
break;
case X86::COPY: {
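// A copy inherits the shape of its source; propagate it to the
// destination virtual register so its tile configuration matches.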
Register SrcReg = MI->getOperand(1).getReg();
ShapeT Shape = getTileShape(SrcReg, VRM, MRI);
VRM->assignVirt2Shape(VirtReg, Shape);
return Shape;
}
// We only collect the tile shape that is defined.
case X86::PTILELOADDV:
case X86::PTDPBSSDV:


@@ -73,6 +73,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86TileConfigPass(PR);
initializeX86LowerTileCopyPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
@@ -508,6 +509,7 @@ void X86PassConfig::addMachineSSAOptimization() {
}
void X86PassConfig::addPostRegAlloc() {
addPass(createX86LowerTileCopyPass());
addPass(createX86FloatingPointStackifierPass());
// When -O0 is enabled, the Load Value Injection Hardening pass will fall back
// to using the Speculative Execution Side Effect Suppression pass for


@@ -0,0 +1,181 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
define dso_local void @test1(i8 *%buf) nounwind {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $64, %eax
; CHECK-NEXT: movw $8, %r14w
; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm3
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
; CHECK-NEXT: # implicit-def: $rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: jl .LBB0_2
; CHECK-NEXT: .LBB0_3: # %exit
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%t1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 64)
br i1 undef, label %loop.header, label %exit
loop.header:
%ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
call void @foo()
br label %loop.body
loop.body:
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
br label %loop.latch
loop.latch:
%iv = add i16 %ivphi, 1
%c = icmp slt i16 %iv, 100
br i1 %c, label %loop.header, label %exit
exit:
ret void
}
define dso_local void @test2(i8 *%buf) nounwind {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, %r14w
; CHECK-NEXT: tilezero %tmm3
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
; CHECK-NEXT: # implicit-def: $rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
br i1 undef, label %loop.header, label %exit
loop.header:
%ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
call void @foo()
br label %loop.body
loop.body:
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
br label %loop.latch
loop.latch:
%iv = add i16 %ivphi, 1
%c = icmp slt i16 %iv, 100
br i1 %c, label %loop.header, label %exit
exit:
ret void
}
declare dso_local void @foo()
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)


@@ -45,6 +45,7 @@
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
; CHECK-NEXT: Fast Register Allocator
; CHECK-NEXT: X86 Lower Tile Copy
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
; CHECK-NEXT: Fixup Statepoint Caller Saved


@@ -145,6 +145,7 @@
; CHECK-NEXT: Stack Slot Coloring
; CHECK-NEXT: Machine Copy Propagation Pass
; CHECK-NEXT: Machine Loop Invariant Code Motion
; CHECK-NEXT: X86 Lower Tile Copy
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
; CHECK-NEXT: MachineDominator Tree Construction