mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 18:54:02 +01:00
AMDGPU: Move dot intrinsic patterns to instruction def
I tried to use some of the new tablegen features to avoid creating different operand list permutations, but I still don't see a way to programmatically build a source pattern dag. Also add GlobalISel tests for these patterns, which now all import successfully. Some of the fneg fold tests are incorrect and need to be fixed in a future commit.
This commit is contained in:
parent
4dec0b4740
commit
13ad7999a2
@ -32,18 +32,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
|
||||
ret1));
|
||||
}
|
||||
|
||||
class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
|
||||
class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp> {
|
||||
dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers));
|
||||
dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers));
|
||||
dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers));
|
||||
dag clamp_dag = (i1 timm:$clamp);
|
||||
|
||||
list<dag> ret3 = [(set P.DstVT:$vdst,
|
||||
(DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
|
||||
(P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
|
||||
(P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
|
||||
!if(HasExplicitClamp,
|
||||
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag),
|
||||
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))];
|
||||
|
||||
list<dag> ret2 = [(set P.DstVT:$vdst,
|
||||
(DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
|
||||
(P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
|
||||
!if(HasExplicitClamp,
|
||||
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag),
|
||||
(DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))];
|
||||
|
||||
list<dag> ret1 = [(set P.DstVT:$vdst,
|
||||
(DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))))];
|
||||
!if(HasExplicitClamp,
|
||||
(DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag),
|
||||
(DivergentFragOrOp<node, P>.ret src0_dag)))];
|
||||
|
||||
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
|
||||
!if(!eq(P.NumSrcArgs, 2), ret2,
|
||||
|
@ -10,9 +10,11 @@
|
||||
// VOP3P Classes
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
|
||||
class VOP3PInst<string OpName, VOPProfile P,
|
||||
SDPatternOperator node = null_frag,
|
||||
bit HasExplicitClamp = 0> :
|
||||
VOP3P_Pseudo<OpName, P,
|
||||
!if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
|
||||
!if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret)
|
||||
>;
|
||||
|
||||
// Non-packed instructions that use the VOP3P encoding.
|
||||
@ -269,42 +271,30 @@ class SDot2Pat<Instruction Inst> : GCNPat <
|
||||
let IsDOT = 1 in {
|
||||
let SubtargetPredicate = HasDot2Insts in {
|
||||
|
||||
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
|
||||
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
|
||||
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
|
||||
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
|
||||
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
|
||||
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
|
||||
VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
|
||||
AMDGPUfdot2, 1/*ExplicitClamp*/>;
|
||||
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
|
||||
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
|
||||
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
|
||||
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
|
||||
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
|
||||
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
|
||||
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
|
||||
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
|
||||
|
||||
} // End SubtargetPredicate = HasDot2Insts
|
||||
|
||||
let SubtargetPredicate = HasDot1Insts in {
|
||||
|
||||
def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
|
||||
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
|
||||
def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
|
||||
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
|
||||
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
|
||||
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
|
||||
|
||||
} // End SubtargetPredicate = HasDot1Insts
|
||||
} // End let IsDOT = 1
|
||||
|
||||
multiclass DotPats<SDPatternOperator dot_op,
|
||||
VOP3PInst dot_inst> {
|
||||
let SubtargetPredicate = dot_inst.SubtargetPredicate in
|
||||
def : GCNPat <
|
||||
(dot_op (dot_inst.Pfl.Src0VT (VOP3PMods dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
|
||||
(dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
|
||||
(dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), timm:$clamp),
|
||||
(dot_inst $src0_modifiers, VSrc_v2f16:$src0,
|
||||
$src1_modifiers, VSrc_v2f16:$src1,
|
||||
$src2_modifiers, VSrc_f32:$src2, timm:$clamp)>;
|
||||
}
|
||||
|
||||
defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
|
||||
defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
|
||||
defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
|
||||
defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
|
||||
defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
|
||||
defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
|
||||
defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
|
||||
|
||||
def : UDot2Pat<V_DOT2_U32_U16>;
|
||||
def : SDot2Pat<V_DOT2_I32_I16>;
|
||||
|
||||
|
388
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
Normal file
388
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
Normal file
@ -0,0 +1,388 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_clamp:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_clamp:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_clamp:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
|
||||
; GFX906-LABEL: v_sdot2_sgpr_sgpr_sgpr:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, s0, v0, v1
|
||||
; GFX906-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_sgpr_sgpr_sgpr:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, s0, v0, v1
|
||||
; GFX908-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_sgpr_sgpr_sgpr:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, s0, s1, v0
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
|
||||
%cast = bitcast i32 %r to float
|
||||
ret float %cast
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_inline_literal_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_inline_literal_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX906-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_inline_literal_a_b_c() {
|
||||
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX906-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, 8
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
|
||||
; GFX906-LABEL: v_sdot2_inline_literal_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, 7
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, 7
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, 7
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_fneg_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_fneg_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fneg_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_fneg_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_fneg_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fneg_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.b = fneg <2 x half> %b
|
||||
%cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
|
||||
; GFX906-LABEL: v_sdot2_fnegf32_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_fnegf32_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fnegf32_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.c = fneg float %c
|
||||
%cast.neg.c = bitcast float %neg.c to i32
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
|
||||
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.c = fneg <2 x half> %c
|
||||
%cast.neg.c = bitcast <2 x half> %neg.c to i32
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_shuffle10_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_shuffle10_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_shuffle10_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot2_shuffle10_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
|
||||
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_shuffle10_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_shuffle10_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
|
||||
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.sdot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
141
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
Normal file
141
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
Normal file
@ -0,0 +1,141 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot4:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot4:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot4_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot4_clamp:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 clamp
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot4_clamp:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 clamp
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 true)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; FIXME: bitcast should not expand
|
||||
define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot4_cast_v4i8:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
|
||||
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, s4, v3
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
|
||||
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX906-NEXT: v_and_b32_e32 v2, s4, v7
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, s4, v5
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
|
||||
; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX906-NEXT: v_bfe_u32 v2, v2, 0, 16
|
||||
; GFX906-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v8
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot4_cast_v4i8:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_and_b32_sdwa v3, v3, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_and_b32_sdwa v5, v5, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_and_b32_sdwa v7, v7, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX10-NEXT: v_bfe_u32 v2, v2, 0, 16
|
||||
; GFX10-NEXT: v_bfe_u32 v3, v3, 0, 16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v7, v1, v8
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%a.cast = bitcast <4 x i8> %a to i32
|
||||
%b.cast = bitcast <4 x i8> %b to i32
|
||||
%r = call i32 @llvm.amdgcn.sdot4(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot4_fnegf32_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot4_fnegf32_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg float %a
|
||||
%cast.neg.a = bitcast float %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.sdot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot4_fnegv2f16_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot4_fnegv2f16_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%cast.neg.a = bitcast <2 x half> %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.sdot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.sdot4(i32, i32, i32, i1 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
94
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
Normal file
94
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
Normal file
@ -0,0 +1,94 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
define i32 @v_sdot8(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot8:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot8:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot8_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot8_clamp:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 clamp
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot8_clamp:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 clamp
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 true)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; FIXME: Fix argument handling so that these casts do not expand
|
||||
; define i32 @v_sdot8_cast_v8i4(<8 x i4> %a, <8 x i4> %b, i32 %c) {
|
||||
; %a.cast = bitcast <8 x i4> %a to i32
|
||||
; %b.cast = bitcast <8 x i4> %b to i32
|
||||
; %r = call i32 @llvm.amdgcn.sdot8(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
|
||||
; ret i32 %r
|
||||
; }
|
||||
|
||||
; Checks selection of llvm.amdgcn.sdot8 when src0 is an fneg'd float bitcast to i32.
; NOTE(review): the expected output folds the fneg into neg_lo/neg_hi on an
; integer dot instruction; the originating commit message states that some of
; the fneg fold tests are incorrect -- confirm before treating this as correct.
define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot8_fnegf32_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot8_fnegf32_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg float %a
|
||||
%cast.neg.a = bitcast float %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.sdot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_sdot8_fnegv2f16_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot8_fnegv2f16_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%cast.neg.a = bitcast <2 x half> %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.sdot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.sdot8(i32, i32, i32, i1 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
388
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
Normal file
388
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
Normal file
@ -0,0 +1,388 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_clamp:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_clamp:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_clamp:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define amdgpu_ps float @v_udot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
|
||||
; GFX906-LABEL: v_udot2_sgpr_sgpr_sgpr:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, s0, v0, v1
|
||||
; GFX906-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_sgpr_sgpr_sgpr:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, s0, v0, v1
|
||||
; GFX908-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_sgpr_sgpr_sgpr:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, s0, s1, v0
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
|
||||
%cast = bitcast i32 %r to float
|
||||
ret float %cast
|
||||
}
|
||||
|
||||
define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_inline_literal_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_inline_literal_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_inline_literal_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_inline_literal_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_inline_literal_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_inline_literal_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_inline_literal_a_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX906-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_inline_literal_a_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_inline_literal_a_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_inline_literal_a_b_c() {
|
||||
; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX906-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
|
||||
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
|
||||
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, 8
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
|
||||
; GFX906-LABEL: v_udot2_inline_literal_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_inline_literal_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_inline_literal_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; Checks selection of llvm.amdgcn.udot2 when src0 is an fneg'd <2 x half> bitcast to <2 x i16>.
; NOTE(review): the expected output folds the fneg into neg_lo/neg_hi on an
; integer dot instruction; the originating commit message states that some of
; the fneg fold tests are incorrect -- confirm before treating this as correct.
define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_fneg_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_fneg_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_fneg_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; Checks selection of llvm.amdgcn.udot2 when src1 is an fneg'd <2 x half> bitcast to <2 x i16>.
; NOTE(review): the expected output folds the fneg into neg_lo/neg_hi on an
; integer dot instruction; the originating commit message states that some of
; the fneg fold tests are incorrect -- confirm before treating this as correct.
define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_fneg_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_fneg_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_fneg_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.b = fneg <2 x half> %b
|
||||
%cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; Checks selection of llvm.amdgcn.udot2 when src2 (the accumulator) is an fneg'd float bitcast to i32.
; NOTE(review): the expected output folds the fneg into neg_lo/neg_hi on an
; integer dot instruction; the originating commit message states that some of
; the fneg fold tests are incorrect -- confirm before treating this as correct.
define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
|
||||
; GFX906-LABEL: v_udot2_fnegf32_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_fnegf32_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_fnegf32_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.c = fneg float %c
|
||||
%cast.neg.c = bitcast float %neg.c to i32
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
|
||||
; GFX906-LABEL: v_udot2_fnegv2f16_c:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_fnegv2f16_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_fnegv2f16_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.c = fneg <2 x half> %c
|
||||
%cast.neg.c = bitcast <2 x half> %neg.c to i32
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_shuffle10_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_shuffle10_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_shuffle10_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot2_shuffle10_b:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
|
||||
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: v_udot2_shuffle10_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
|
||||
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot2_shuffle10_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
|
||||
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.udot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
141
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
Normal file
141
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
Normal file
@ -0,0 +1,141 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
define i32 @v_udot4(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot4:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot4:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot4_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot4_clamp:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 clamp
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot4_clamp:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 clamp
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 true)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; FIXME: bitcast should not expand
|
||||
define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot4_cast_v4i8:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
|
||||
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, s4, v3
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
|
||||
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX906-NEXT: v_and_b32_e32 v2, s4, v7
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, s4, v5
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
|
||||
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
|
||||
; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX906-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX906-NEXT: v_bfe_u32 v2, v2, 0, 16
|
||||
; GFX906-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v8
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot4_cast_v4i8:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_and_b32_sdwa v3, v3, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_and_b32_sdwa v5, v5, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_and_b32_sdwa v7, v7, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX10-NEXT: v_bfe_u32 v2, v2, 0, 16
|
||||
; GFX10-NEXT: v_bfe_u32 v3, v3, 0, 16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10-NEXT: v_dot4_u32_u8 v0, v7, v1, v8
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%a.cast = bitcast <4 x i8> %a to i32
|
||||
%b.cast = bitcast <4 x i8> %b to i32
|
||||
%r = call i32 @llvm.amdgcn.udot4(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; Checks selection of llvm.amdgcn.udot4 when src0 is an fneg'd float bitcast to i32.
; NOTE(review): the expected output folds the fneg into neg_lo/neg_hi on an
; integer dot instruction; the originating commit message states that some of
; the fneg fold tests are incorrect -- confirm before treating this as correct.
define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot4_fnegf32_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot4_fnegf32_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg float %a
|
||||
%cast.neg.a = bitcast float %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.udot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot4_fnegv2f16_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot4_fnegv2f16_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%cast.neg.a = bitcast <2 x half> %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.udot4(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.udot4(i32, i32, i32, i1 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
94
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
Normal file
94
test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
Normal file
@ -0,0 +1,94 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
; Baseline selection test: llvm.amdgcn.udot8 on three plain i32 arguments with
; the i1 immediate set to false. Both check blocks expect a single
; v_dot8_u32_u4 with no clamp and no source modifiers.
define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot8:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot8:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; Same as v_udot8 but with the i1 immediate set to true. Both check blocks
; expect the selected v_dot8_u32_u4 to carry the "clamp" modifier.
define i32 @v_udot8_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot8_clamp:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 clamp
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot8_clamp:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 clamp
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 true)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; FIXME: Fix argument handling, and do not let these casts expand
|
||||
; define i32 @v_udot8_cast_v8i4(<8 x i4> %a, <8 x i4> %b, i32 %c) {
|
||||
; %a.cast = bitcast <8 x i4> %a to i32
|
||||
; %b.cast = bitcast <8 x i4> %b to i32
|
||||
; %r = call i32 @llvm.amdgcn.udot8(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
|
||||
; ret i32 %r
|
||||
; }
|
||||
|
||||
; udot8 whose first operand is a float fneg bitcast to i32 (clamp argument
; false). The checks currently expect the fneg to be folded into the
; instruction as neg_lo:[1,0,0] neg_hi:[1,0,0] on v_dot8_u32_u4.
; NOTE(review): the commit that added this test states that some of the fneg
; fold tests are incorrect and will be fixed later; folding an f32 sign flip
; into packed neg modifiers on an integer dot product is presumably one of
; those cases -- confirm before relying on these checks.
define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot8_fnegf32_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot8_fnegf32_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg float %a
|
||||
%cast.neg.a = bitcast float %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.udot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; udot8 whose first operand is a <2 x half> fneg bitcast to i32 (clamp
; argument false). Unlike the f32 variant above, the checks expect the
; negation to stay a separate v_xor_b32 with 0x80008000, followed by a
; v_dot8_u32_u4 without any neg_lo/neg_hi modifiers.
define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
|
||||
; GFX906-LABEL: v_udot8_fnegv2f16_a:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_udot8_fnegv2f16_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%cast.neg.a = bitcast <2 x half> %neg.a to i32
|
||||
%r = call i32 @llvm.amdgcn.udot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; Intrinsic under test: three i32 operands plus a trailing i1 that is marked
; immarg (the tests in this file pass it as a literal true/false).
declare i32 @llvm.amdgcn.udot8(i32, i32, i32, i1 immarg) #0
|
||||
|
||||
; Attribute group applied to the intrinsic declaration above.
attributes #0 = { nounwind readnone speculatable }
|
Loading…
Reference in New Issue
Block a user