
[X86] Support __tile_stream_loadd intrinsic for new AMX interface

Add support for the __tile_stream_loadd intrinsic, which performs a streaming tile load (lowered to tileloaddt1) through the new AMX interface.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D103784
Bing1 Yu 2021-06-11 17:20:58 +08:00
parent 5c5e621290
commit c7250ce1db
9 changed files with 27 additions and 6 deletions
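
For reference, below is a minimal sketch of how the new intrinsic is expected to be used from C through the new AMX interface. The header-level wrappers (__tile1024i, __tile_loadd, __tile_stream_loadd, __tile_zero, __tile_dpbssd, __tile_stored) belong to Clang's amxintrin.h and land in a separate Clang-side patch; their names and signatures below are assumptions based on that header, not part of this LLVM change.

#include <immintrin.h>
#include <stddef.h>

/* Sketch only, assuming the amxintrin.h wrappers named above.
   Build with something like: clang -O2 -mamx-tile -mamx-int8 ... */
void dot_i8(const void *abuf, const void *bbuf, void *cbuf, size_t stride) {
  __tile1024i a = {16, 64};   /* 16 rows x 64 bytes per row */
  __tile1024i b = {16, 64};
  __tile1024i c = {16, 64};

  __tile_stream_loadd(&a, abuf, stride); /* streaming load: llvm.x86.tileloaddt164.internal -> tileloaddt1 */
  __tile_loadd(&b, bbuf, stride);        /* regular load:   llvm.x86.tileloadd64.internal   -> tileloadd   */
  __tile_zero(&c);                       /* clear the accumulator tile */
  __tile_dpbssd(&c, a, b);               /* signed int8 dot-product accumulate */
  __tile_stored(cbuf, stride, c);        /* store the result tile */
}

The only difference from the existing __tile_loadd path is the non-temporal hint: the streaming variant selects the PTILELOADDT1V pseudo and is later expanded to tileloaddt1 instead of tileloadd, as the X86ISelDAGToDAG and X86ExpandPseudo hunks below show.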

View File

@@ -5050,6 +5050,11 @@ let TargetPrefix = "x86" in {
           Intrinsic<[llvm_x86amx_ty],
                     [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                     []>;
+  def int_x86_tileloaddt164_internal :
+      GCCBuiltin<"__builtin_ia32_tileloaddt164_internal">,
+      Intrinsic<[llvm_x86amx_ty],
+                [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+                []>;
   def int_x86_tdpbssd_internal :
       GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
       Intrinsic<[llvm_x86amx_ty],

View File

@@ -554,10 +554,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     MI.setDesc(TII->get(X86::LDTILECFG));
     return true;
   }
-  case X86::PTILELOADDV: {
+  case X86::PTILELOADDV:
+  case X86::PTILELOADDT1V: {
     for (unsigned i = 2; i > 0; --i)
       MI.RemoveOperand(i);
-    MI.setDesc(TII->get(X86::TILELOADD));
+    unsigned Opc =
+        Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1;
+    MI.setDesc(TII->get(Opc));
     return true;
   }
   case X86::PTDPBSSDV:

View File

@@ -122,7 +122,8 @@ static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
 }
 bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
-  return MI.getOpcode() == X86::PTILELOADDV;
+  return MI.getOpcode() == X86::PTILELOADDV ||
+         MI.getOpcode() == X86::PTILELOADDT1V;
 }
 bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
   return MI.getOpcode() == X86::PTILESTOREDV;

View File

@@ -4617,10 +4617,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       ReplaceNode(Node, Res);
       return;
     }
-    case Intrinsic::x86_tileloadd64_internal: {
+    case Intrinsic::x86_tileloadd64_internal:
+    case Intrinsic::x86_tileloaddt164_internal: {
       if (!Subtarget->hasAMXTILE())
         break;
-      unsigned Opc = X86::PTILELOADDV;
+      unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
+                         ? X86::PTILELOADDV
+                         : X86::PTILELOADDT1V;
       // _tile_loadd_internal(row, col, buf, STRIDE)
       SDValue Base = Node->getOperand(4);
       SDValue Scale = getI8Imm(1, dl);

View File

@@ -53,6 +53,9 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
   def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
                                                    GR16:$src2,
                                                    opaquemem:$src3), []>;
+  def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+                                                     GR16:$src2,
+                                                     opaquemem:$src3), []>;
   def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
                                           GR16:$src2, opaquemem:$src3,
                                           TILE:$src4), []>;

View File

@@ -121,6 +121,7 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
   default:
     llvm_unreachable("Expect amx intrinsics");
   case Intrinsic::x86_tileloadd64_internal:
+  case Intrinsic::x86_tileloaddt164_internal:
   case Intrinsic::x86_tilestored64_internal: {
     Row = II->getArgOperand(0);
     Col = II->getArgOperand(1);

View File

@@ -65,7 +65,8 @@ static bool isAMXIntrinsic(IntrinsicInst *II) {
 }
 static bool isTileLoad(IntrinsicInst *II) {
-  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
+  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal ||
+         II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal;
 }
 static bool isTileStore(IntrinsicInst *II) {

View File

@@ -892,6 +892,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   }
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
+  case X86::PTILELOADDT1V:
   case X86::PTDPBSSDV:
   case X86::PTDPBSUDV:
   case X86::PTDPBUSDV:

View File

@@ -23,6 +23,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 ; CHECK-NEXT:    tdpbusd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:    tdpbuud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tileloaddt1 (%rsi,%rdx), %tmm1
 ; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    vzeroupper
@@ -35,6 +36,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
   %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
   %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
   %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride)
   call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
   ret void
@@ -42,6 +44,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)