1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 04:02:41 +01:00

[AMDGPU] link dpp pseudos and real instructions on gfx10

This defaults the fi operand to zero, but we do not expose it
anyway. Should we expose it later, it needs to be added to
the pseudo.

This enables dpp combining on gfx10.

Differential Revision: https://reviews.llvm.org/D68888

llvm-svn: 374604
This commit is contained in:
Stanislav Mekhanoshin 2019-10-11 22:03:36 +00:00
parent 90fe48ddb1
commit 99df21352a
5 changed files with 5036 additions and 305 deletions

View File

@ -211,6 +211,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
lowerOperand(MO, MCOp); lowerOperand(MO, MCOp);
OutMI.addOperand(MCOp); OutMI.addOperand(MCOp);
} }
int FIIdx = AMDGPU::getNamedOperandIdx(MCOpcode, AMDGPU::OpName::fi);
if (FIIdx >= (int)OutMI.getNumOperands())
OutMI.addOperand(MCOperand::createImm(0));
} }
bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,

View File

@ -441,7 +441,7 @@ let SubtargetPredicate = isGFX10Plus in {
// Target-specific instruction encodings. // Target-specific instruction encodings.
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> : class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
VOP_DPP<ps.OpName, p, isDPP16> { VOP_DPP<ps.OpName, p, isDPP16> {
let hasSideEffects = ps.hasSideEffects; let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs; let Defs = ps.Defs;
@ -455,8 +455,9 @@ class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 =
let Inst{31-25} = 0x3f; let Inst{31-25} = 0x3f;
} }
class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> :
VOP1_DPP<op, ps, p, 1> { VOP1_DPP<op, ps, p, 1>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> {
let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
let SubtargetPredicate = HasDPP16; let SubtargetPredicate = HasDPP16;
} }
@ -507,7 +508,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
} }
multiclass VOP1_Real_dpp_gfx10<bits<9> op> { multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA10"; let DecoderNamespace = "SDWA10";
} }
} }
@ -840,7 +841,7 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>; def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>; def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
let OtherPredicates = [isGFX8GFX9] in { let OtherPredicates = [isGFX8Plus] in {
def : GCNPat < def : GCNPat <
(i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
@ -858,7 +859,7 @@ def : GCNPat <
(as_i1imm $bound_ctrl)) (as_i1imm $bound_ctrl))
>; >;
} // End OtherPredicates = [isGFX8GFX9] } // End OtherPredicates = [isGFX8Plus]
let OtherPredicates = [isGFX8Plus] in { let OtherPredicates = [isGFX8Plus] in {
def : GCNPat< def : GCNPat<
@ -916,20 +917,4 @@ def : GCNPat <
(i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
(V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0)) (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
>; >;
def : GCNPat <
(i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
timm:$bound_ctrl)),
(V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl),
(as_i32imm $row_mask), (as_i32imm $bank_mask),
(as_i1imm $bound_ctrl), (i32 0))
>;
def : GCNPat <
(i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
(V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl),
(as_i32imm $row_mask), (as_i32imm $bank_mask),
(as_i1imm $bound_ctrl), (i32 0))
>;
} // End OtherPredicates = [isGFX10Plus] } // End OtherPredicates = [isGFX10Plus]

View File

@ -658,14 +658,14 @@ let Constraints = "$vdst = $src2",
isCommutable = 1, isCommutable = 1,
IsDOT = 1 in { IsDOT = 1 in {
let SubtargetPredicate = HasDot5Insts in let SubtargetPredicate = HasDot5Insts in
defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
let SubtargetPredicate = HasDot6Insts in let SubtargetPredicate = HasDot6Insts in
defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
let SubtargetPredicate = HasDot4Insts in let SubtargetPredicate = HasDot4Insts in
defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>; defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
let SubtargetPredicate = HasDot3Insts in let SubtargetPredicate = HasDot3Insts in
defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>; defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
} }
let AddedComplexity = 30 in { let AddedComplexity = 30 in {
@ -800,7 +800,7 @@ def : GCNPat<
// Target-specific instruction encodings. // Target-specific instruction encodings.
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
class VOP2_DPP<bits<6> op, VOP2_Pseudo ps, class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl, string opName = ps.OpName, VOPProfile p = ps.Pfl,
bit IsDPP16 = 0> : bit IsDPP16 = 0> :
VOP_DPP<opName, p, IsDPP16> { VOP_DPP<opName, p, IsDPP16> {
@ -818,13 +818,18 @@ class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
let Inst{31} = 0x0; let Inst{31} = 0x0;
} }
class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps, class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> : string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP2_DPP<op, ps, opName, p, 1> { VOP2_DPP<op, ps, opName, p, 1> {
let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
let SubtargetPredicate = HasDPP16; let SubtargetPredicate = HasDPP16;
} }
class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
Base_VOP2_DPP16<op, ps, opName, p>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>;
class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> : string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP_DPP8<ps.OpName, p> { VOP_DPP8<ps.OpName, p> {
@ -885,7 +890,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
} }
multiclass VOP2_Real_dpp_gfx10<bits<6> op> { multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA10"; let DecoderNamespace = "SDWA10";
} }
} }
@ -929,7 +934,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName, multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName,
string asmName> { string asmName> {
foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> { def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP16; let AsmString = asmName # ps.Pfl.AsmDPP16;
} }
@ -969,7 +974,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
} }
foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx10 : def _dpp_gfx10 :
VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # !subst(", vcc", "", AsmDPP); let AsmString = asmName # !subst(", vcc", "", AsmDPP);
let DecoderNamespace = "SDWA10"; let DecoderNamespace = "SDWA10";
@ -992,7 +997,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let DecoderNamespace = "SDWA10"; let DecoderNamespace = "SDWA10";
} }
def _dpp_w32_gfx10 : def _dpp_w32_gfx10 :
VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
let isAsmParserOnly = 1; let isAsmParserOnly = 1;
@ -1015,7 +1020,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let DecoderNamespace = "SDWA10"; let DecoderNamespace = "SDWA10";
} }
def _dpp_w64_gfx10 : def _dpp_w64_gfx10 :
VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # AsmDPP; let AsmString = asmName # AsmDPP;
let isAsmParserOnly = 1; let isAsmParserOnly = 1;
@ -1513,7 +1518,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
} // End SubtargetPredicate = HasDLInsts } // End SubtargetPredicate = HasDLInsts
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> { multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
} }
multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> : multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,53 @@
; Codegen test for DPP instructions: every function below is compiled once
; for gfx900 and once for gfx1010, and both runs share one check prefix, so
; the same expected v_*_dpp output has to hold on both targets.
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; Integer add: one operand is the loaded value routed through update.dpp, the
; other is the loaded value itself. The checks expect a v_add_u32_dpp (the
; optional "nc_" in the pattern also accepts v_add_nc_u32_dpp) that uses the
; loaded register for all three operands. The trailing {{$}} anchors the end
; of the line, so nothing may be printed after bound_ctrl:0.
; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_dword [[V:v[0-9]+]],
; GCN: v_add_{{(nc_)?}}u32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
define amdgpu_kernel void @dpp_add(i32 addrspace(1)* %arg) {
; Address the i32 element selected by the work-item id and load it.
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
%load = load i32, i32 addrspace(1)* %gep
; The loaded value is passed as both of the first two arguments; the constants
; 1, 15, 15, 1 show up in the expected assembly as quad_perm:[1,0,0,0],
; row_mask:0xf, bank_mask:0xf and bound_ctrl:0.
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
%add = add i32 %tmp0, %load
; The result is written back to the element that was loaded.
store i32 %add, i32 addrspace(1)* %gep
ret void
}
; Unary float operation: the update.dpp result is bitcast to float and fed to
; llvm.ceil.f32. The checks expect a v_ceil_f32_dpp whose destination and
; source are both the loaded register. The trailing {{$}} anchors the end of
; the line, so nothing may be printed after bound_ctrl:0.
; GCN-LABEL: {{^}}dpp_ceil:
; GCN: global_load_dword [[V:v[0-9]+]],
; GCN: v_ceil_f32_dpp [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
define amdgpu_kernel void @dpp_ceil(i32 addrspace(1)* %arg) {
; Address the i32 element selected by the work-item id and load it.
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
%load = load i32, i32 addrspace(1)* %gep
; Same update.dpp call as in the other tests of this file (constants 1, 15, 15, 1).
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
; The intrinsic is declared on i32, so the value is reinterpreted as float
; for the ceil and back to i32 for the store.
%tmp1 = bitcast i32 %tmp0 to float
%round = tail call float @llvm.ceil.f32(float %tmp1)
%tmp2 = bitcast float %round to i32
store i32 %tmp2, i32 addrspace(1)* %gep
ret void
}
; Binary float operation: fadd of the update.dpp result (bitcast to float) and
; the loaded value (bitcast to float). The checks expect a v_add_f32_dpp that
; uses the loaded register for all three operands. The trailing {{$}} anchors
; the end of the line, so nothing may be printed after bound_ctrl:0.
; GCN-LABEL: {{^}}dpp_fadd:
; GCN: global_load_dword [[V:v[0-9]+]],
; GCN: v_add_f32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
define amdgpu_kernel void @dpp_fadd(i32 addrspace(1)* %arg) {
; Address the i32 element selected by the work-item id and load it.
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
%load = load i32, i32 addrspace(1)* %gep
; Same update.dpp call as in the other tests of this file (constants 1, 15, 15, 1).
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
; Both fadd operands are i32 values reinterpreted as float.
%tmp1 = bitcast i32 %tmp0 to float
%t = bitcast i32 %load to float
%add = fadd float %tmp1, %t
; The sum is reinterpreted as i32 and written back to the loaded element.
%tmp2 = bitcast float %add to i32
store i32 %tmp2, i32 addrspace(1)* %gep
ret void
}
; Declarations of the intrinsics called by the test functions in this file.
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
declare float @llvm.ceil.f32(float)
; Attribute group #0 is attached to the update.dpp declaration and to each of
; its call sites.
attributes #0 = { nounwind readnone convergent }