1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

AMDGPU: Move dot intrinsic patterns to instruction def

I tried to use some of the new tablegen features to avoid creating
different operand list permutations, but I still don't see a way to
programmatically build a source pattern dag.

Also add GlobalISel tests, which now all import successfully.

Some of the fneg fold tests are incorrect and need to be fixed in a
future commit.
This commit is contained in:
Matt Arsenault 2020-02-19 10:42:31 -05:00 committed by Matt Arsenault
parent 4dec0b4740
commit 13ad7999a2
8 changed files with 1280 additions and 36 deletions

View File

@ -32,18 +32,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
ret1));
}
class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp> {
dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers));
dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers));
dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers));
dag clamp_dag = (i1 timm:$clamp);
list<dag> ret3 = [(set P.DstVT:$vdst,
(DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
(P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
!if(HasExplicitClamp,
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag),
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))];
list<dag> ret2 = [(set P.DstVT:$vdst,
(DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
(P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
!if(HasExplicitClamp,
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag),
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))];
list<dag> ret1 = [(set P.DstVT:$vdst,
(DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))))];
!if(HasExplicitClamp,
(DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag),
(DivergentFragOrOp<node, P>.ret src0_dag)))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,

View File

@ -10,9 +10,11 @@
// VOP3P Classes
//===----------------------------------------------------------------------===//
// Pseudo-instruction class for VOP3P-encoded operations. The selection
// pattern comes from getVOP3PModPat when P.HasModifiers is set and from
// getVOP3Pat otherwise. HasExplicitClamp is forwarded only to
// getVOP3PModPat, so it has no effect for profiles without modifiers.
// NOTE(review): this span shows two variants of the class header and of the
// !if line; presumably a diff rendering with the change markers stripped -
// confirm which variant is current before editing.
class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
class VOP3PInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag,
bit HasExplicitClamp = 0> :
VOP3P_Pseudo<OpName, P,
!if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
!if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret)
>;
// Non-packed instructions that use the VOP3P encoding.
@ -269,42 +271,30 @@ class SDot2Pat<Instruction Inst> : GCNPat <
// Dot-product instructions, all marked IsDOT and grouped by the subtarget
// predicate that guards them (HasDot2Insts or HasDot1Insts).
// The defs that name a selection node (AMDGPUfdot2 or an int_amdgcn_*dot*
// intrinsic) also pass 1 for VOP3PInst's HasExplicitClamp argument.
// NOTE(review): each instruction appears twice in this span, once without
// and once with a node argument; presumably old and new lines of a diff
// shown together - confirm.
let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
AMDGPUfdot2, 1/*ExplicitClamp*/>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End SubtargetPredicate = HasDot2Insts
let SubtargetPredicate = HasDot1Insts in {
def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End SubtargetPredicate = HasDot1Insts
} // End let IsDOT = 1
// Emits one GCNPat that selects dot_op to dot_inst, guarded by the
// instruction's own SubtargetPredicate. The source pattern wraps each of the
// three sources in VOP3PMods using the value types from dot_inst.Pfl and
// takes the clamp flag as a target immediate (timm).
// NOTE(review): the result pattern names VSrc_v2f16 / VSrc_f32 for every
// instantiation, although the source types come from the profile and differ
// per instruction - confirm whether that is intentional.
multiclass DotPats<SDPatternOperator dot_op,
VOP3PInst dot_inst> {
let SubtargetPredicate = dot_inst.SubtargetPredicate in
def : GCNPat <
(dot_op (dot_inst.Pfl.Src0VT (VOP3PMods dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
(dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
(dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), timm:$clamp),
(dot_inst $src0_modifiers, VSrc_v2f16:$src0,
$src1_modifiers, VSrc_v2f16:$src1,
$src2_modifiers, VSrc_f32:$src2, timm:$clamp)>;
}
// One DotPats instantiation per (selection node, dot instruction) pair.
defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
// Additional anonymous patterns built from the UDot2Pat / SDot2Pat classes
// for the 16-bit unsigned and signed dot2 instructions.
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;

View File

@ -0,0 +1,388 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Baseline selection of llvm.amdgcn.sdot2. All three operands are plain
; function arguments and the clamp immediate is false.
define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; Same as the baseline sdot2 case but with the clamp immediate set to true.
; The expected instruction carries the "clamp" modifier.
define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_clamp:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_clamp:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
ret i32 %r
}
; amdgpu_ps entry point with all three operands marked inreg. The expected
; output copies two of the SGPR inputs into VGPRs for the gfx906/gfx908 runs
; and only one for the gfx10 runs, where two SGPRs are used directly as
; sources. The i32 result is bitcast to float for the return value.
define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
; GFX906-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX906: ; %bb.0:
; GFX906-NEXT: v_mov_b32_e32 v0, s1
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: v_dot2_i32_i16 v0, s0, v0, v1
; GFX906-NEXT: ; return to shader part epilog
;
; GFX908-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX908: ; %bb.0:
; GFX908-NEXT: v_mov_b32_e32 v0, s1
; GFX908-NEXT: v_mov_b32_e32 v1, s2
; GFX908-NEXT: v_dot2_i32_i16 v0, s0, v0, v1
; GFX908-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, s0, s1, v0
; GFX10-NEXT: ; return to shader part epilog
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
%cast = bitcast i32 %r to float
ret float %cast
}
; Operand a is the constant splat <4, 4>. The expected output builds the
; packed value in an SGPR with s_pack_ll_b32_b16 and passes that register as
; the first source.
define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; Operand b is the constant splat <4, 4>. The expected output builds the
; packed value in an SGPR with s_pack_ll_b32_b16 and passes that register as
; the second source.
define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
}
; Both vector operands are constant splats (a = <8, 8>, b = <4, 4>); the %a
; parameter is not used by the call. For the gfx906/gfx908 runs one packed
; SGPR value is copied to a VGPR before the dot instruction, while the gfx10
; runs use both SGPRs directly.
define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
}
; All operands are constants: a = <8, 8>, b = <4, 4>, c = 8. The vector
; constants are packed into SGPRs, while the accumulator 8 appears directly
; as the last source operand of the expected instruction.
define i32 @v_sdot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, 8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
ret i32 %r
}
; The accumulator is the constant 7, which appears directly as the last
; source operand of the expected instruction.
define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
; GFX906-LABEL: v_sdot2_inline_literal_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, 7
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, 7
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, 7
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
ret i32 %r
}
; Operand a is a <2 x half> fneg bitcast to <2 x i16>. The expected output
; folds the negation into neg_lo/neg_hi modifiers on the first source.
; NOTE(review): the commit that added this test states that some of the fneg
; fold tests are incorrect; folding a floating-point negation into an integer
; dot product looks like one of those cases - confirm.
define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; Operand b is a <2 x half> fneg bitcast to <2 x i16>. The expected output
; folds the negation into neg_lo/neg_hi modifiers on the second source.
; NOTE(review): the commit that added this test states that some of the fneg
; fold tests are incorrect; folding a floating-point negation into an integer
; dot product looks like one of those cases - confirm.
define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.b = fneg <2 x half> %b
%cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
ret i32 %r
}
; The accumulator is a float fneg bitcast to i32. The expected output folds
; the negation into neg_lo/neg_hi modifiers on the third source.
; NOTE(review): the commit that added this test states that some of the fneg
; fold tests are incorrect; folding a floating-point negation into an integer
; accumulator looks like one of those cases - confirm.
define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
ret i32 %r
}
; The accumulator is a <2 x half> fneg bitcast to i32. The expected output
; keeps the negation as an explicit v_xor_b32 with 0x80008000 and emits the
; dot instruction without neg modifiers.
define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg <2 x half> %c
%cast.neg.c = bitcast <2 x half> %neg.c to i32
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
ret i32 %r
}
; Operand a has its two i16 lanes swapped by a shufflevector with mask
; <1, 0>. The expected output performs the swap with a separate
; v_alignbit_b32 by 16 before the dot instruction.
define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; Operand b has its two i16 lanes swapped by a shufflevector with mask
; <1, 0>. The expected output performs the swap with a separate
; v_alignbit_b32 by 16 before the dot instruction.
define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
ret i32 %r
}
declare i32 @llvm.amdgcn.sdot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }

View File

@ -0,0 +1,141 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Baseline selection of llvm.amdgcn.sdot4. All three operands are plain
; function arguments and the clamp immediate is false.
define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Same as the baseline sdot4 case but with the clamp immediate set to true.
; The expected instruction carries the "clamp" modifier.
define i32 @v_sdot4_clamp(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_clamp:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 clamp
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 true)
ret i32 %r
}
; FIXME: bitcast should not expand
; Operands a and b are <4 x i8> arguments bitcast to i32. The expected output
; currently rebuilds each i32 from its bytes with and/shift/or sequences
; before the dot instruction.
define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX906-LABEL: v_sdot4_cast_v4i8:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_movk_i32 s4, 0xff
; GFX906-NEXT: v_and_b32_e32 v1, s4, v1
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_and_b32_e32 v1, s4, v3
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX906-NEXT: v_and_b32_e32 v2, s4, v7
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: v_and_b32_e32 v1, s4, v5
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX906-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX906-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_cast_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v3, v3, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v5, v5, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v7, v7, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX10-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX10-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; GFX10-NEXT: v_dot4_i32_i8 v0, v7, v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%a.cast = bitcast <4 x i8> %a to i32
%b.cast = bitcast <4 x i8> %b to i32
%r = call i32 @llvm.amdgcn.sdot4(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
ret i32 %r
}
; Operand a is a float fneg bitcast to i32. The expected output folds the
; negation into neg_lo/neg_hi modifiers on the first source.
; NOTE(review): the commit that added this test states that some of the fneg
; fold tests are incorrect; folding a floating-point negation into an integer
; dot product looks like one of those cases - confirm.
define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
%r = call i32 @llvm.amdgcn.sdot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Operand a is a <2 x half> fneg bitcast to i32. The expected output keeps
; the negation as an explicit v_xor_b32 with 0x80008000 and emits the dot
; instruction without neg modifiers.
define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_fnegv2f16_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
%r = call i32 @llvm.amdgcn.sdot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
declare i32 @llvm.amdgcn.sdot4(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }

View File

@ -0,0 +1,94 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Baseline selection of llvm.amdgcn.sdot8. All three operands are plain
; function arguments and the clamp immediate is false.
define i32 @v_sdot8(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Same as the baseline sdot8 case but with the clamp immediate set to true.
; The expected instruction carries the "clamp" modifier.
define i32 @v_sdot8_clamp(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_clamp:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 clamp
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 true)
ret i32 %r
}
; FIXME: Fix argument handling; do not let these casts expand
; define i32 @v_sdot8_cast_v8i4(<8 x i4> %a, <8 x i4> %b, i32 %c) {
; %a.cast = bitcast <8 x i4> %a to i32
; %b.cast = bitcast <8 x i4> %b to i32
; %r = call i32 @llvm.amdgcn.sdot8(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
; ret i32 %r
; }
; Operand a is a float fneg bitcast to i32. The expected output folds the
; negation into neg_lo/neg_hi modifiers on the first source.
; NOTE(review): the commit that added this test states that some of the fneg
; fold tests are incorrect; folding a floating-point negation into an integer
; dot product looks like one of those cases - confirm.
define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
%r = call i32 @llvm.amdgcn.sdot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Operand a is a <2 x half> fneg bitcast to i32. The expected output keeps
; the negation as an explicit v_xor_b32 with 0x80008000 and emits the dot
; instruction without neg modifiers.
define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_fnegv2f16_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
%r = call i32 @llvm.amdgcn.sdot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
declare i32 @llvm.amdgcn.sdot8(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }

View File

@ -0,0 +1,388 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Baseline udot2 selection: both <2 x i16> operands and the i32 accumulator
; are plain function arguments and clamp is disabled (i1 false). The checks
; expect a single v_dot2_u32_u16 with no modifiers on every tested target.
define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; Same as v_udot2 but with the trailing immediate set to i1 true. The checks
; expect the 'clamp' modifier to appear on the selected v_dot2_u32_u16.
define i32 @v_udot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_clamp:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_clamp:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
ret i32 %r
}
; udot2 in an amdgpu_ps function where all three operands are marked inreg;
; the i32 result is bitcast to float for the return value.
; The GFX906/GFX908 checks use s0 directly and copy s1 and s2 into v0/v1
; first, while the GFX10 checks use s0 and s1 directly and copy only s2.
; NOTE(review): presumably this reflects different limits on how many scalar
; operands one VALU instruction may read on these targets -- confirm.
define amdgpu_ps float @v_udot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
; GFX906-LABEL: v_udot2_sgpr_sgpr_sgpr:
; GFX906: ; %bb.0:
; GFX906-NEXT: v_mov_b32_e32 v0, s1
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: v_dot2_u32_u16 v0, s0, v0, v1
; GFX906-NEXT: ; return to shader part epilog
;
; GFX908-LABEL: v_udot2_sgpr_sgpr_sgpr:
; GFX908: ; %bb.0:
; GFX908-NEXT: v_mov_b32_e32 v0, s1
; GFX908-NEXT: v_mov_b32_e32 v1, s2
; GFX908-NEXT: v_dot2_u32_u16 v0, s0, v0, v1
; GFX908-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_udot2_sgpr_sgpr_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, s0, s1, v0
; GFX10-NEXT: ; return to shader part epilog
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
%cast = bitcast i32 %r to float
ret float %cast
}
; udot2 with operand 'a' replaced by the constant vector <i16 4, i16 4>.
; The checks expect the constant to be built in s4 with s_pack_ll_b32_b16 and
; then passed to v_dot2_u32_u16 as the first source, rather than appearing as
; an immediate on the dot instruction itself.
define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; udot2 with operand 'b' replaced by the constant vector <i16 4, i16 4>.
; The checks expect the constant to be built in s4 with s_pack_ll_b32_b16 and
; then used as the second source of v_dot2_u32_u16.
define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
}
; udot2 with both vector operands constant: 'a' is <i16 8, i16 8> and 'b' is
; <i16 4, i16 4>. Only %c is read by the body; the %a parameter is unused, but
; it still occupies v0, so the accumulator is v1 in the checks.
; The GFX906/GFX908 checks build both constants in s4/s5 and copy one of them
; into v0 before the dot, while the GFX10 checks feed both SGPRs directly.
; NOTE(review): presumably the same scalar-operand limit difference seen in
; v_udot2_sgpr_sgpr_sgpr -- confirm.
define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
}
; udot2 with every operand constant: 'a' is <i16 8, i16 8>, 'b' is
; <i16 4, i16 4> and the accumulator is i32 8. The checks expect the two
; vector constants to be built with s_pack_ll_b32_b16 (with one copied to v0
; on GFX906/GFX908) while the accumulator appears directly as the immediate 8
; on v_dot2_u32_u16.
define i32 @v_udot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, 8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
ret i32 %r
}
; udot2 with only the accumulator constant (i32 7). The checks expect the
; value to appear directly as the immediate third source of v_dot2_u32_u16,
; with no extra instruction to materialize it.
define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
; GFX906-LABEL: v_udot2_inline_literal_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
ret i32 %r
}
; udot2 with operand 'a' produced by negating a <2 x half> and bitcasting it
; to <2 x i16>. The checks expect the negation to be folded into
; neg_lo:[1,0,0] neg_hi:[1,0,0] on v_dot2_u32_u16.
; NOTE(review): the commit that introduced this test states that some of the
; fneg fold expectations are incorrect and will be fixed later; this fold of
; a floating-point negate into an integer dot may be one of them -- confirm.
define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_fneg_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fneg_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fneg_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; udot2 with operand 'b' produced by negating a <2 x half> and bitcasting it
; to <2 x i16>. The checks expect the negation to be folded into
; neg_lo:[0,1,0] neg_hi:[0,1,0] on v_dot2_u32_u16.
; NOTE(review): the commit that introduced this test states that some of the
; fneg fold expectations are incorrect and will be fixed later; this fold of
; a floating-point negate into an integer dot may be one of them -- confirm.
define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
; GFX906-LABEL: v_udot2_fneg_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fneg_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fneg_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.b = fneg <2 x half> %b
%cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
ret i32 %r
}
; udot2 with the accumulator produced by negating an f32 and bitcasting it to
; i32. The checks expect the negation to be folded into
; neg_lo:[0,0,1] neg_hi:[0,0,1] on v_dot2_u32_u16.
; NOTE(review): the commit that introduced this test states that some of the
; fneg fold expectations are incorrect and will be fixed later; folding a
; float sign flip into the integer accumulator looks like one of them --
; confirm.
define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_udot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegf32_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
ret i32 %r
}
; udot2 with the accumulator produced by negating a <2 x half> and bitcasting
; it to i32. Unlike v_udot2_fnegf32_c, the checks expect the negation to stay
; a separate v_xor_b32 with 0x80008000 on v2 and the v_dot2_u32_u16 to carry
; no neg_lo/neg_hi modifiers.
define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_udot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegv2f16_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg <2 x half> %c
%cast.neg.c = bitcast <2 x half> %neg.c to i32
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
ret i32 %r
}
; udot2 with operand 'a' passed through a shufflevector that swaps its two
; i16 elements (mask <1, 0>). The checks expect the swap to be emitted as a
; separate v_alignbit_b32 v0, v0, v0, 16 ahead of an unmodified
; v_dot2_u32_u16; the shuffle is not folded into the dot instruction.
define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
}
; udot2 with operand 'b' passed through a shufflevector that swaps its two
; i16 elements (mask <1, 0>). The checks expect the swap to be emitted as a
; separate v_alignbit_b32 v1, v1, v1, 16 ahead of an unmodified
; v_dot2_u32_u16; the shuffle is not folded into the dot instruction.
define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
ret i32 %r
}
; Intrinsic under test: two <2 x i16> operands, an i32 accumulator and a
; trailing i1 immediate (immarg); passing true yields the 'clamp' modifier in
; v_udot2_clamp. Attribute group #0 marks it nounwind, readnone and
; speculatable.
declare i32 @llvm.amdgcn.udot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }

View File

@ -0,0 +1,141 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Baseline udot4 selection: all three i32 operands are plain function
; arguments and clamp is disabled (i1 false). The checks expect a single
; v_dot4_u32_u8 with no modifiers.
define i32 @v_udot4(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Same as v_udot4 but with the trailing immediate set to i1 true. The checks
; expect the 'clamp' modifier to appear on the selected v_dot4_u32_u8.
define i32 @v_udot4_clamp(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_clamp:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot4_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 clamp
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 true)
ret i32 %r
}
; udot4 with operands 'a' and 'b' given as <4 x i8> vectors that are bitcast
; to i32 before the call.
; FIXME: the bitcast should not expand. The checks currently read v0-v7 as
; inputs and v8 as the accumulator, and repack the byte elements with
; and/shift/or/bfe sequences before the v_dot4_u32_u8.
; NOTE(review): this suggests each i8 element arrives in its own register --
; confirm against the calling-convention lowering.
define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX906-LABEL: v_udot4_cast_v4i8:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_movk_i32 s4, 0xff
; GFX906-NEXT: v_and_b32_e32 v1, s4, v1
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_and_b32_e32 v1, s4, v3
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX906-NEXT: v_and_b32_e32 v2, s4, v7
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: v_and_b32_e32 v1, s4, v5
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX906-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX906-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot4_cast_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v3, v3, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v5, v5, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v7, v7, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX10-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX10-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; GFX10-NEXT: v_dot4_u32_u8 v0, v7, v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%a.cast = bitcast <4 x i8> %a to i32
%b.cast = bitcast <4 x i8> %b to i32
%r = call i32 @llvm.amdgcn.udot4(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
ret i32 %r
}
; udot4 with operand 'a' produced by negating an f32 and bitcasting it to i32;
; clamp is disabled (i1 false). The checks expect the negation to be folded
; into neg_lo:[1,0,0] neg_hi:[1,0,0] on v_dot4_u32_u8.
; NOTE(review): the commit that introduced this test states that some of the
; fneg fold expectations are incorrect and will be fixed later; folding a
; float sign flip into an integer dot looks like one of them -- confirm.
define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot4_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
%r = call i32 @llvm.amdgcn.udot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; udot4 with operand 'a' produced by negating a <2 x half> and bitcasting it
; to i32; clamp is disabled (i1 false). Unlike v_udot4_fnegf32_a, the checks
; expect the negation to stay a separate v_xor_b32 with 0x80008000 and the
; v_dot4_u32_u8 to carry no neg_lo/neg_hi modifiers.
define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot4_fnegv2f16_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
%r = call i32 @llvm.amdgcn.udot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Intrinsic under test: three i32 operands and a trailing i1 immediate
; (immarg); passing true yields the 'clamp' modifier in v_udot4_clamp.
; Attribute group #0 marks it nounwind, readnone and speculatable.
declare i32 @llvm.amdgcn.udot4(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }

View File

@ -0,0 +1,94 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Baseline udot8 selection: all three i32 operands are plain function
; arguments and clamp is disabled (i1 false). The checks expect a single
; v_dot8_u32_u4 with no modifiers.
define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Same as v_udot8 but with the trailing immediate set to i1 true. The checks
; expect the 'clamp' modifier to appear on the selected v_dot8_u32_u4.
define i32 @v_udot8_clamp(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_clamp:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 clamp
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 true)
ret i32 %r
}
; FIXME: Fix argument handling and do not let these casts expand
; define i32 @v_udot8_cast_v8i4(<8 x i4> %a, <8 x i4> %b, i32 %c) {
; %a.cast = bitcast <8 x i4> %a to i32
; %b.cast = bitcast <8 x i4> %b to i32
; %r = call i32 @llvm.amdgcn.udot8(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
; ret i32 %r
; }
; udot8 with operand 'a' produced by negating an f32 and bitcasting it to i32;
; clamp is disabled (i1 false). The checks expect the negation to be folded
; into neg_lo:[1,0,0] neg_hi:[1,0,0] on v_dot8_u32_u4.
; NOTE(review): the commit that introduced this test states that some of the
; fneg fold expectations are incorrect and will be fixed later; folding a
; float sign flip into an integer dot looks like one of them -- confirm.
define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
%r = call i32 @llvm.amdgcn.udot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; udot8 with operand 'a' produced by negating a <2 x half> and bitcasting it
; to i32; clamp is disabled (i1 false). Unlike v_udot8_fnegf32_a, the checks
; expect the negation to stay a separate v_xor_b32 with 0x80008000 and the
; v_dot8_u32_u4 to carry no neg_lo/neg_hi modifiers.
define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8_fnegv2f16_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
%r = call i32 @llvm.amdgcn.udot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
ret i32 %r
}
; Intrinsic under test: three i32 operands and a trailing i1 immediate
; (immarg); passing true yields the 'clamp' modifier in v_udot8_clamp.
; Attribute group #0 marks it nounwind, readnone and speculatable.
declare i32 @llvm.amdgcn.udot8(i32, i32, i32, i1 immarg) #0
attributes #0 = { nounwind readnone speculatable }