1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

[PowerPC] Add intrinsics for MMA

This patch adds support for MMA intrinsics.

Authored by: Baptiste Saleil

Reviewed By: #powerpc, bsaleil, amyk

Differential Revision: https://reviews.llvm.org/D89345
This commit is contained in:
Baptiste Saleil 2020-10-13 14:19:15 -05:00 committed by Ahsan Saghir
parent bac894562a
commit e248116f2c
5 changed files with 2767 additions and 2 deletions

View File

@ -141,6 +141,28 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
Intrinsic<ret_types, param_types, properties>;
}
//===----------------------------------------------------------------------===//
// PowerPC MMA Intrinsic Multi Class Definitions.
//
multiclass PowerPC_MMA_ACC_Intrinsic<list<LLVMType> args> {
def NAME: Intrinsic<[llvm_v512i1_ty], args, [IntrNoMem]>;
def pp : Intrinsic<[llvm_v512i1_ty], !listconcat([llvm_v512i1_ty], args),
[IntrNoMem]>;
def pn : Intrinsic<[llvm_v512i1_ty], !listconcat([llvm_v512i1_ty], args),
[IntrNoMem]>;
def np : Intrinsic<[llvm_v512i1_ty], !listconcat([llvm_v512i1_ty], args),
[IntrNoMem]>;
def nn : Intrinsic<[llvm_v512i1_ty], !listconcat([llvm_v512i1_ty], args),
[IntrNoMem]>;
}
multiclass PowerPC_MMA_ACC_PP_Intrinsic<list<LLVMType> args> {
def NAME: Intrinsic<[llvm_v512i1_ty], args, [IntrNoMem]>;
def pp : Intrinsic<[llvm_v512i1_ty], !listconcat([llvm_v512i1_ty], args),
[IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// PowerPC Altivec Intrinsic Class Definitions.
//
@ -1371,7 +1393,6 @@ def int_ppc_cfence : Intrinsic<[], [llvm_anyint_ty], []>;
// PowerPC set FPSCR Intrinsic Definitions.
def int_ppc_setrnd : GCCBuiltin<"__builtin_setrnd">,
Intrinsic<[llvm_double_ty], [llvm_i32_ty], []>;
}
let TargetPrefix = "ppc" in {
@ -1400,5 +1421,60 @@ let TargetPrefix = "ppc" in {
def int_ppc_mma_xxsetaccz :
Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>;
}
// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty]>;
defm int_ppc_mma_xvi8ger4 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvi8ger4 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty]>;
defm int_ppc_mma_xvi16ger2s :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvi16ger2s :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty]>;
defm int_ppc_mma_xvf16ger2 :
PowerPC_MMA_ACC_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvf16ger2 :
PowerPC_MMA_ACC_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty]>;
defm int_ppc_mma_xvf32ger :
PowerPC_MMA_ACC_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvf32ger :
PowerPC_MMA_ACC_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty]>;
defm int_ppc_mma_xvf64ger :
PowerPC_MMA_ACC_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvf64ger :
PowerPC_MMA_ACC_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty]>;
// MMA Reduced-Precision: bfloat16 Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvbf16ger2 :
PowerPC_MMA_ACC_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvbf16ger2 :
PowerPC_MMA_ACC_Intrinsic<
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
// MMA Reduced-Precision: Missing Integer-based Outer Product Operations.
defm int_ppc_mma_xvi16ger2 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
defm int_ppc_mma_pmxvi16ger2 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty]>;
def int_ppc_mma_xvi8ger4spp :
Intrinsic<[llvm_v512i1_ty],
[llvm_v512i1_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
def int_ppc_mma_pmxvi8ger4spp :
Intrinsic<[llvm_v512i1_ty],
[llvm_v512i1_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
}

View File

@ -1,3 +1,8 @@
// Mask immediates for MMA instructions (2, 4 and 8 bits).
def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;
//===----------------------------------------------------------------------===//
// PowerPC ISA 3.1 specific type constraints.
//
@ -1341,6 +1346,220 @@ defm XVF64GER : ACC_NEG_UM_M42_XOM84C<59, 58, (ins vsrpevenrc:$XA, vsrc:$XB),
"xvf64ger", "$AT, $XA, $XB">;
//------------------------------------------------------------------------------
// MMA Intrinsics
let Predicates = [MMA] in {
def : Pat<(v512i1 (int_ppc_mma_xvi4ger8 v16i8:$XA, v16i8:$XB)),
(XVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi8ger4 v16i8:$XA, v16i8:$XB)),
(XVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi16ger2s v16i8:$XA, v16i8:$XB)),
(XVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf16ger2 v16i8:$XA, v16i8:$XB)),
(XVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf32ger v16i8:$XA, v16i8:$XB)),
(XVF32GER RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf64ger v256i1:$XA, v16i8:$XB)),
(XVF64GER $XA, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
(XVF64GERPP $ATi, $XA, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
(XVF64GERPN $ATi, $XA, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
(XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
(XVF64GERNN $ATi, $XA, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2 v16i8:$XA, v16i8:$XB)),
(XVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi16ger2 v16i8:$XA, v16i8:$XB)),
(XVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
def : Pat<(v512i1 (int_ppc_mma_xvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
(XVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
}
// MMA Intrinsics
let Predicates = [MMA, PrefixInstrs] in {
def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk8Imm:$PMSK)),
(PMXVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk8Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk8Imm:$PMSK)),
(PMXVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk8Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
(PMXVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk4Imm:$PMSK)),
(PMXVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2s v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMXVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMXVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf32ger v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk4Imm:$YMSK)),
(PMXVF32GER RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
(PMXVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
(PMXVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
(PMXVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
(PMXVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf64ger v256i1:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk2Imm:$YMSK)),
(PMXVF64GER $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, Msk2Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
(PMXVF64GERPP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk2Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
(PMXVF64GERPN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk2Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
(PMXVF64GERNP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk2Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
(PMXVF64GERNN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk2Imm:$YMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMXVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMXVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
Msk4Imm:$XMSK, Msk4Imm:$YMSK,
Msk2Imm:$PMSK)),
(PMXVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
}
def Concats {
dag VecsToVecPair0 =
(v256i1 (INSERT_SUBREG

View File

@ -6,6 +6,412 @@
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
; Function Attrs: nofree nounwind writeonly
define dso_local void @test50(i8* nocapture readnone %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test50:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvbf16ger2 acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test50:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xvbf16ger2 acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> %vc, <16 x i8> %vc)
%1 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %0, <512 x i1>* %1, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind writeonly
define dso_local void @test51(i8* nocapture readnone %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test51:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pmxvbf16ger2 acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test51:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: pmxvbf16ger2 acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%1 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %0, <512 x i1>* %1, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8>, <16 x i8>, i32, i32, i32)
; Function Attrs: nofree nounwind
define dso_local void @test52(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test52:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: xvbf16ger2pp acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test52:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: xvbf16ger2pp acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind
define dso_local void @test53(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test53:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: xvbf16ger2pn acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test53:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: xvbf16ger2pn acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1>, <16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind
define dso_local void @test54(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test54:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: xvbf16ger2np acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test54:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: xvbf16ger2np acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1>, <16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind
define dso_local void @test55(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test55:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: xvbf16ger2nn acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test55:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: xvbf16ger2nn acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1>, <16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind
define dso_local void @test56(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test56:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: pmxvbf16ger2pp acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test56:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: pmxvbf16ger2pp acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>, i32, i32, i32)
; Function Attrs: nofree nounwind
define dso_local void @test57(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test57:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: pmxvbf16ger2pn acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test57:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: pmxvbf16ger2pn acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1>, <16 x i8>, <16 x i8>, i32, i32, i32)
; Function Attrs: nofree nounwind
define dso_local void @test58(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test58:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: pmxvbf16ger2np acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test58:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: pmxvbf16ger2np acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1>, <16 x i8>, <16 x i8>, i32, i32, i32)
; Function Attrs: nofree nounwind
define dso_local void @test59(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test59:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: pmxvbf16ger2nn acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test59:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: pmxvbf16ger2nn acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1>, <16 x i8>, <16 x i8>, i32, i32, i32)
; Function Attrs: nofree nounwind writeonly
define dso_local void @test60(i8* nocapture readnone %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test60:
@ -25,6 +431,7 @@ entry:
store <16 x i8> %0, <16 x i8>* %1, align 16
ret void
}
; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.ppc.vsx.xvcvspbf16(<16 x i8>)

View File

@ -0,0 +1,241 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
; Function Attrs: nofree nounwind writeonly
define dso_local void @test1(i8* nocapture readnone %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvi16ger2 acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test1:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xvi16ger2 acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc)
%1 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %0, <512 x i1>* %1, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind writeonly
define dso_local void @test2(i8* nocapture readnone %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pmxvi16ger2 acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test2:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: pmxvi16ger2 acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%1 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %0, <512 x i1>* %1, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8>, <16 x i8>, i32, i32, i32)
; Function Attrs: nofree nounwind
define dso_local void @test3(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: xvi8ger4spp acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test3:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: xvi8ger4spp acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1>, <16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind
define dso_local void @test4(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: xvi16ger2pp acc0, v2, v2
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test4:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: xvi16ger2pp acc0, v2, v2
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>)
; Function Attrs: nofree nounwind
define dso_local void @test5(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: pmxvi8ger4spp acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test5:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: pmxvi8ger4spp acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1>, <16 x i8>, <16 x i8>, i32, i32, i32)
; Function Attrs: nofree nounwind
define dso_local void @test6(i8* nocapture readonly %vqp, i8* nocapture readnone %vpp, <16 x i8> %vc, i8* nocapture %resp) {
; CHECK-LABEL: test6:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv vs1, 32(r3)
; CHECK-NEXT: lxv vs0, 48(r3)
; CHECK-NEXT: lxv vs3, 0(r3)
; CHECK-NEXT: lxv vs2, 16(r3)
; CHECK-NEXT: xxmtacc acc0
; CHECK-NEXT: pmxvi16ger2pp acc0, v2, v2, 0, 0, 0
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 48(r7)
; CHECK-NEXT: stxv vs1, 32(r7)
; CHECK-NEXT: stxv vs2, 16(r7)
; CHECK-NEXT: stxv vs3, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test6:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: xxmtacc acc0
; CHECK-BE-NEXT: pmxvi16ger2pp acc0, v2, v2, 0, 0, 0
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs1, 16(r7)
; CHECK-BE-NEXT: stxv vs0, 0(r7)
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i8* %vqp to <512 x i1>*
%1 = load <512 x i1>, <512 x i1>* %0, align 64
%2 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
%3 = bitcast i8* %resp to <512 x i1>*
store <512 x i1> %2, <512 x i1>* %3, align 64
ret void
}
; Function Attrs: nounwind readnone
declare <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>, i32, i32, i32)

File diff suppressed because it is too large Load Diff