1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

[X86-64] Support Intel AMX Intrinsic

INTEL ADVANCED MATRIX EXTENSIONS (AMX).
AMX is a new programming paradigm: it provides a set of 2-dimensional registers
(TILES) representing sub-arrays of a larger 2-dimensional memory image, and
instructions that operate on those TILES.

These intrinsics take TMM register numbers directly as their parameters.

Spec can be found in Chapter 3 here https://software.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D83111
This commit is contained in:
Xiang1 Zhang 2020-07-07 09:50:17 +08:00
parent ff0e91295b
commit 535fe28ef4
7 changed files with 245 additions and 4 deletions

View File

@ -4948,3 +4948,32 @@ let TargetPrefix = "x86" in {
def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">, def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">,
Intrinsic<[], [], []>; Intrinsic<[], [], []>;
} }
//===----------------------------------------------------------------------===//
// AMX - Intel AMX extensions
//
// Every intrinsic in this group returns nothing. Tile operands are passed as
// i8 values carrying the TMM register number rather than as tile-typed values.
// NOTE(review): the i8 tile-number operands are not declared ImmArg here,
// while the instruction selection code added in the same change reads them as
// constants -- confirm how a non-constant tile number is meant to be rejected.
let TargetPrefix = "x86" in {
// Tile configuration: one pointer operand each.
def int_x86_ldtilecfg : GCCBuiltin<"__builtin_ia32_tile_loadconfig">,
Intrinsic<[], [llvm_ptr_ty], []>;
def int_x86_sttilecfg : GCCBuiltin<"__builtin_ia32_tile_storeconfig">,
Intrinsic<[], [llvm_ptr_ty], []>;
// No operands.
def int_x86_tilerelease : GCCBuiltin<"__builtin_ia32_tilerelease">,
Intrinsic<[], [], []>;
// Single operand: tile number.
def int_x86_tilezero : GCCBuiltin<"__builtin_ia32_tilezero">,
Intrinsic<[], [llvm_i8_ty], []>;
// Tile loads/stores: (tile number, base pointer, i64 stride).
def int_x86_tileloadd64 : GCCBuiltin<"__builtin_ia32_tileloadd64">,
Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
def int_x86_tileloaddt164 : GCCBuiltin<"__builtin_ia32_tileloaddt164">,
Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
def int_x86_tilestored64 : GCCBuiltin<"__builtin_ia32_tilestored64">,
Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
// Tile dot products: three tile numbers; the first one names the tile that
// is also written by the lowered instruction.
def int_x86_tdpbssd : GCCBuiltin<"__builtin_ia32_tdpbssd">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
def int_x86_tdpbsud : GCCBuiltin<"__builtin_ia32_tdpbsud">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
def int_x86_tdpbusd : GCCBuiltin<"__builtin_ia32_tdpbusd">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
def int_x86_tdpbuud : GCCBuiltin<"__builtin_ia32_tdpbuud">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
def int_x86_tdpbf16ps : GCCBuiltin<"__builtin_ia32_tdpbf16ps">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
}

View File

@ -4435,8 +4435,39 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break; break;
} }
// Manually select the AMX tile load/store intrinsics into the PTILE* pseudo
// instructions, which carry the tile number as an i8 immediate.
case Intrinsic::x86_tileloadd64:
case Intrinsic::x86_tileloaddt164:
case Intrinsic::x86_tilestored64: {
// Without AMX-TILE, leave the node unselected here and fall out of the switch.
if (!Subtarget->hasAMXTILE())
break;
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
}
// FIXME: Match displacement and scale.
// Node operands read below: 0 = chain, 2 = tile number, 3 = base pointer,
// 4 = stride. (Operand 1 is presumably the intrinsic ID -- confirm.)
// getConstantOperandVal requires operand 2 to be a constant node.
unsigned TIndex = Node->getConstantOperandVal(2);
SDValue TReg = getI8Imm(TIndex, dl);
// Build a fixed-shape memory reference: base + 1 * stride + 0, no segment.
SDValue Base = Node->getOperand(3);
SDValue Scale = getI8Imm(1, dl);
SDValue Index = Node->getOperand(4);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
// The store pseudo takes the memory operands first and the tile number last;
// the load pseudos take the tile number first. The chain is always last.
if (Opc == X86::PTILESTORED) {
SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
} else {
SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
}
ReplaceNode(Node, CNode);
return;
}
} }
break; break;
} }
case ISD::BRIND: { case ISD::BRIND: {

View File

@ -33044,6 +33044,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc(); DebugLoc DL = MI.getDebugLoc();
// Translates a tile-number immediate into a register id by offsetting from
// X86::TMM0; only indices 0 through 7 are accepted.
auto TMMImmToTMMReg = [](unsigned TileIdx) {
  assert(TileIdx < 8 && "Illegal tmm index");
  return X86::TMM0 + TileIdx;
};
switch (MI.getOpcode()) { switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert"); default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32: case X86::TLS_addr32:
@ -33326,6 +33330,67 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); MI.eraseFromParent();
return BB; return BB;
} }
case X86::PTDPBSSD:
case X86::PTDPBSUD:
case X86::PTDPBUSD:
case X86::PTDPBUUD:
case X86::PTDPBF16PS: {
  // Expand an AMX dot-product pseudo (three tile-number immediates) into the
  // real instruction operating on the matching TMM registers.
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opc;
  switch (MI.getOpcode()) {
  // Keeps Opc provably initialized and matches the other opcode switches in
  // this change, which all trap on an unexpected value.
  default: llvm_unreachable("Unexpected AMX dot-product pseudo!");
  case X86::PTDPBSSD:   Opc = X86::TDPBSSD; break;
  case X86::PTDPBSUD:   Opc = X86::TDPBSUD; break;
  case X86::PTDPBUSD:   Opc = X86::TDPBUSD; break;
  case X86::PTDPBUUD:   Opc = X86::TDPBUUD; break;
  case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
  }

  // Operand 0 names the tile that is both read and written, so its register
  // is added twice: once as the def and once as the first source.
  const unsigned DstReg = TMMImmToTMMReg(MI.getOperand(0).getImm());
  const unsigned Src2Reg = TMMImmToTMMReg(MI.getOperand(1).getImm());
  const unsigned Src3Reg = TMMImmToTMMReg(MI.getOperand(2).getImm());

  MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
  MIB.addReg(DstReg, RegState::Define);
  // Source tiles are flagged undef. NOTE(review): presumably because the
  // physical TMM registers have no modelled reaching definition in this
  // immediate-based scheme -- confirm.
  MIB.addReg(DstReg, RegState::Undef);
  MIB.addReg(Src2Reg, RegState::Undef);
  MIB.addReg(Src3Reg, RegState::Undef);

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
case X86::PTILEZERO: {
  // Swap the pseudo for a real TILEZERO whose destination is the TMM register
  // named by the pseudo's immediate operand.
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned TileReg = TMMImmToTMMReg(MI.getOperand(0).getImm());
  BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TileReg);
  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
case X86::PTILELOADD:
case X86::PTILELOADDT1:
case X86::PTILESTORED: {
  // Expand a tile load/store pseudo into the real instruction: the tile-number
  // immediate becomes a TMM register and the five memory operands are copied
  // over unchanged.
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opc;
  switch (MI.getOpcode()) {
  // Keeps Opc provably initialized and matches the other opcode switches in
  // this change, which all trap on an unexpected value.
  default: llvm_unreachable("Unexpected AMX tile load/store pseudo!");
  case X86::PTILELOADD:   Opc = X86::TILELOADD; break;
  case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
  case X86::PTILESTORED:  Opc = X86::TILESTORED; break;
  }

  MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
  unsigned CurOp = 0;
  // Loads: the tile number is the pseudo's first operand and becomes the def.
  if (Opc != X86::TILESTORED)
    MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
               RegState::Define);

  MIB.add(MI.getOperand(CurOp++)); // base
  MIB.add(MI.getOperand(CurOp++)); // scale
  MIB.add(MI.getOperand(CurOp++)); // index -- stride
  MIB.add(MI.getOperand(CurOp++)); // displacement
  MIB.add(MI.getOperand(CurOp++)); // segment

  // Store: the tile number follows the memory operands and is a source,
  // flagged undef like the other tile sources in this lowering.
  if (Opc == X86::TILESTORED)
    MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
               RegState::Undef);

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
} }
} }

View File

@ -18,9 +18,11 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let SchedRW = [WriteSystem] in { let SchedRW = [WriteSystem] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"ldtilecfg\t$src", []>, VEX, T8PS; "ldtilecfg\t$src",
[(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"sttilecfg\t$src", []>, VEX, T8PD; "sttilecfg\t$src",
[(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src), (ins sibmem:$src),
"tileloadd\t{$src, $dst|$dst, $src}", []>, "tileloadd\t{$src, $dst|$dst, $src}", []>,
@ -31,7 +33,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8PD; VEX, T8PD;
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
"tilerelease", []>, VEX, T8PS; "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
(ins sibmem:$dst, TILE:$src), (ins sibmem:$dst, TILE:$src),
"tilestored\t{$src, $dst|$dst, $src}", []>, "tilestored\t{$src, $dst|$dst, $src}", []>,
@ -39,6 +41,17 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins), def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins),
"tilezero\t$dst", []>, "tilezero\t$dst", []>,
VEX, T8XD; VEX, T8XD;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
// The load/store pseudos carry no selection pattern; they are created by
// hand in X86DAGToDAGISel::Select. Loads take (tile number, memory), the
// store takes (memory, tile number).
def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
sibmem:$src2), []>;
// NOTE(review): the store uses i8mem while the loads use sibmem -- confirm
// this difference is intentional.
def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
// Selected directly from the tilezero intrinsic via its pattern.
def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
[(int_x86_tilezero imm:$src)]>;
}
} // SchedRW } // SchedRW
} // HasAMXTILE } // HasAMXTILE
@ -62,6 +75,27 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
"tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
VEX_4V, T8PS; VEX_4V, T8PS;
} }
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
// Each pseudo takes three tile numbers and is matched straight from the
// intrinsic of the same name; no outputs are modelled on the pseudo itself.
def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
[(int_x86_tdpbssd imm:$src1,
imm:$src2, imm:$src3)]>;
def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
[(int_x86_tdpbsud imm:$src1,
imm:$src2, imm:$src3)]>;
def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
[(int_x86_tdpbusd imm:$src1,
imm:$src2, imm:$src3)]>;
def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
[(int_x86_tdpbuud imm:$src1,
imm:$src2, imm:$src3)]>;
}
} }
} // HasAMXTILE } // HasAMXTILE
@ -72,5 +106,14 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
(ins TILE:$src1, TILE:$src2, TILE:$src3), (ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[]>, VEX_4V, T8XS; []>, VEX_4V, T8XS;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
// Takes three tile numbers and is matched straight from the tdpbf16ps
// intrinsic; no outputs are modelled on the pseudo itself.
def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
[(int_x86_tdpbf16ps imm:$src1,
imm:$src2, imm:$src3)]>;
}
} }
} // HasAMXTILE, HasAMXBF16 } // HasAMXTILE, HasAMXBF16

View File

@ -0,0 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+amx-bf16 -verify-machineinstrs | FileCheck %s
; Verifies that the bf16 tile dot-product intrinsic, called with constant tile
; numbers (3, 4, 7), is emitted as one instruction on tmm3, tmm4 and tmm7.
define void @test_amx() {
; CHECK-LABEL: test_amx:
; CHECK: # %bb.0:
; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
; CHECK-NEXT: retq
call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
ret void
}
declare void @llvm.x86.tdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)

View File

@ -0,0 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s
; Verifies that each of the four int8 tile dot-product intrinsics, called with
; constant tile numbers, is emitted as one instruction on the matching tmm
; registers, in call order.
define void @test_amx() {
; CHECK-LABEL: test_amx:
; CHECK: # %bb.0:
call void @llvm.x86.tdpbssd(i8 3, i8 4, i8 7)
; CHECK-NEXT: tdpbssd %tmm7, %tmm4, %tmm3
call void @llvm.x86.tdpbsud(i8 3, i8 4, i8 7)
; CHECK-NEXT: tdpbsud %tmm7, %tmm4, %tmm3
call void @llvm.x86.tdpbusd(i8 3, i8 0, i8 7)
; CHECK-NEXT: tdpbusd %tmm7, %tmm0, %tmm3
call void @llvm.x86.tdpbuud(i8 3, i8 4, i8 1)
; CHECK-NEXT: tdpbuud %tmm1, %tmm4, %tmm3
ret void
}
declare void @llvm.x86.tdpbssd(i8 %tile0, i8 %tile1, i8 %tile2)
declare void @llvm.x86.tdpbsud(i8 %tile0, i8 %tile1, i8 %tile2)
declare void @llvm.x86.tdpbusd(i8 %tile0, i8 %tile1, i8 %tile2)
declare void @llvm.x86.tdpbuud(i8 %tile0, i8 %tile1, i8 %tile2)

View File

@ -0,0 +1,36 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -verify-machineinstrs | FileCheck %s
; Verifies code generation for the tile configuration, release, zero, load and
; store intrinsics. Loads and the store use tile number 3 with %base as the
; address base and %stride as the index register.
define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
; CHECK-LABEL: test_amx:
; CHECK: # %bb.0:
call void @llvm.x86.ldtilecfg(i8* %pointer)
; CHECK-NEXT: ldtilecfg (%rdi)
call void @llvm.x86.sttilecfg(i8* %pointer)
; CHECK-NEXT: sttilecfg (%rdi)
call void @llvm.x86.tilerelease()
; CHECK-NEXT: tilerelease
call void @llvm.x86.tilezero(i8 3)
; CHECK-NEXT: tilezero %tmm3
call void @llvm.x86.tileloadd64(i8 3, i8* %base, i64 %stride)
; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm3
call void @llvm.x86.tileloaddt164(i8 3, i8* %base, i64 %stride)
; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm3
call void @llvm.x86.tilestored64(i8 3, i8* %base, i64 %stride)
; CHECK-NEXT: tilestored %tmm3, (%rsi,%rdx)
ret void
}
declare void @llvm.x86.tileloadd64(i8 %tile, i8* %base, i64 %stride)
declare void @llvm.x86.tileloaddt164(i8 %tile, i8* %base, i64 %stride)
declare void @llvm.x86.tilestored64(i8 %tile, i8* %base, i64 %stride)
declare void @llvm.x86.ldtilecfg(i8* %pointer)
declare void @llvm.x86.sttilecfg(i8* %pointer)
declare void @llvm.x86.tilerelease()
declare void @llvm.x86.tilezero(i8 %tile)