
[X86][AMX] Lower tile copy instruction.

Since there is no tile copy instruction, we store the tile register
to the stack and load it from the stack into another tile register.
We need an extra GR to hold the stride, and a stack slot to hold the
tile data. We run this pass after copy propagation, so that we don't
miss any copy optimization, and before prolog/epilog insertion, so
that we can allocate the stack slot.
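
For a copy between two tile registers the pass emits, roughly, the
sequence below (a sketch in AT&T syntax; the stack-slot offsets are
placeholders that prolog/epilog insertion resolves later):

    # %tmm1 = COPY %tmm0 becomes:
    movq %rax, <stride-slot>(%rsp)            # save the scratch GR
    movq $64, %rax                            # materialize the stride
    tilestored %tmm0, <tile-slot>(%rsp,%rax)  # store the source tile
    tileloadd <tile-slot>(%rsp,%rax), %tmm1   # reload into the destination
    movq <stride-slot>(%rsp), %rax            # restore the scratch GR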

Differential Revision: https://reviews.llvm.org/D97112
Author: Luo, Yuanke
Date: 2021-02-20 15:05:07 +08:00
parent 771be1b79d
commit ede5e5d465
8 changed files with 330 additions and 0 deletions


@@ -32,6 +32,7 @@ set(sources
X86CmovConversion.cpp
X86DomainReassignment.cpp
X86DiscriminateMemOps.cpp
X86LowerTileCopy.cpp
X86LowerAMXType.cpp
X86TileConfig.cpp
X86PreTileConfig.cpp


@@ -76,10 +76,15 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
/// Return a pass that configures the tile registers.
FunctionPass *createX86TileConfigPass();
/// Return a pass that inserts the pseudo tile config instruction.
FunctionPass *createX86PreTileConfigPass();
/// Return a pass that lowers the tile copy instruction.
FunctionPass *createX86LowerTileCopyPass();
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
@@ -169,6 +174,7 @@ void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
namespace X86AS {
enum : unsigned {


@@ -0,0 +1,132 @@
//===-- X86LowerTileCopy.cpp - Expand Tile Copy Instructions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass that lowers AMX tile copy instructions. Since
// there is no tile copy instruction, we store the tile register to the
// stack and load it from the stack into another tile register. We need an
// extra GR to hold the stride, and a stack slot to hold the tile data. We
// run this pass after copy propagation, so that we don't miss any copy
// optimization, and before prolog/epilog insertion, so that we can allocate
// the stack slot.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "x86-lower-tile-copy"
namespace {
class X86LowerTileCopy : public MachineFunctionPass {
public:
static char ID;
X86LowerTileCopy() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "X86 Lower Tile Copy"; }
};
} // namespace
char X86LowerTileCopy::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
false, false)
INITIALIZE_PASS_END(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
false, false)
void X86LowerTileCopy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
FunctionPass *llvm::createX86LowerTileCopyPass() {
return new X86LowerTileCopy();
}
bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const X86InstrInfo *TII = ST.getInstrInfo();
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
MII != MIE;) {
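// MI may be erased below; advance the iterator first.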
MachineInstr &MI = *MII++;
if (!MI.isCopy())
continue;
MachineOperand &DstMO = MI.getOperand(0);
MachineOperand &SrcMO = MI.getOperand(1);
Register SrcReg = SrcMO.getReg();
Register DstReg = DstMO.getReg();
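// Only a copy between two tile registers needs this lowering.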
if (!X86::TILERegClass.contains(DstReg, SrcReg))
continue;
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
// Allocate stack slot for tile register
unsigned Size = TRI->getSpillSize(X86::TILERegClass);
Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
// Allocate stack slot for stride register
Size = TRI->getSpillSize(X86::GR64RegClass);
Alignment = TRI->getSpillAlign(X86::GR64RegClass);
int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
// TODO: Pick a killed register to avoid the save/reload. There is no easy
// way to get live intervals at this stage.
Register GR64Cand = X86::RAX;
const DebugLoc &DL = MI.getDebugLoc();
// Save the scratch register: mov %rax, (%rsp)
BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
.addReg(GR64Cand);
// Materialize the stride: mov $64, %rax
BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
// tilestored %tmm, (%sp, %idx)
unsigned Opc = X86::TILESTORED;
MachineInstr *NewMI =
addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS)
.addReg(SrcReg, getKillRegState(SrcMO.isKill()));
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(GR64Cand);
MO.setIsKill(true);
// tileloadd (%sp, %idx), %tmm
Opc = X86::TILELOADD;
NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
TileSS);
// Restore the scratch register: mov (%rsp), %rax
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
StrideSS);
MI.eraseFromParent();
Changed = true;
}
}
return Changed;
}


@@ -875,6 +875,12 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
default:
llvm_unreachable("Unexpected machine instruction on tile register!");
break;
case X86::COPY: {
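// A copy inherits the shape of its source; propagate it to the
// destination virtual register so its tile configuration matches.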
Register SrcReg = MI->getOperand(1).getReg();
ShapeT Shape = getTileShape(SrcReg, VRM, MRI);
VRM->assignVirt2Shape(VirtReg, Shape);
return Shape;
}
// We only collect the tile shape that is defined.
case X86::PTILELOADDV:
case X86::PTDPBSSDV:


@@ -73,6 +73,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86TileConfigPass(PR);
initializeX86LowerTileCopyPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
@@ -508,6 +509,7 @@ void X86PassConfig::addMachineSSAOptimization() {
}
void X86PassConfig::addPostRegAlloc() {
addPass(createX86LowerTileCopyPass());
addPass(createX86FloatingPointStackifierPass());
// When -O0 is enabled, the Load Value Injection Hardening pass will fall back
// to using the Speculative Execution Side Effect Suppression pass for


@@ -0,0 +1,181 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
define dso_local void @test1(i8 *%buf) nounwind {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $64, %eax
; CHECK-NEXT: movw $8, %r14w
; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm3
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
; CHECK-NEXT: # implicit-def: $rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: jl .LBB0_2
; CHECK-NEXT: .LBB0_3: # %exit
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%t1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 64)
br i1 undef, label %loop.header, label %exit
loop.header:
%ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
call void @foo()
br label %loop.body
loop.body:
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
br label %loop.latch
loop.latch:
%iv = add i16 %ivphi, 1
%c = icmp slt i16 %iv, 100
br i1 %c, label %loop.header, label %exit
exit:
ret void
}
define dso_local void @test2(i8 *%buf) nounwind {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, %r14w
; CHECK-NEXT: tilezero %tmm3
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
; CHECK-NEXT: # implicit-def: $rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
br i1 undef, label %loop.header, label %exit
loop.header:
%ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
call void @foo()
br label %loop.body
loop.body:
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
br label %loop.latch
loop.latch:
%iv = add i16 %ivphi, 1
%c = icmp slt i16 %iv, 100
br i1 %c, label %loop.header, label %exit
exit:
ret void
}
declare dso_local void @foo()
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)


@@ -45,6 +45,7 @@
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
; CHECK-NEXT: Fast Register Allocator
; CHECK-NEXT: X86 Lower Tile Copy
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
; CHECK-NEXT: Fixup Statepoint Caller Saved


@@ -145,6 +145,7 @@
; CHECK-NEXT: Stack Slot Coloring
; CHECK-NEXT: Machine Copy Propagation Pass
; CHECK-NEXT: Machine Loop Invariant Code Motion
; CHECK-NEXT: X86 Lower Tile Copy
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
; CHECK-NEXT: MachineDominator Tree Construction