mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics
This patch adjusts the following ARM/AArch64 LLVM IR intrinsics: - neon_bfmmla - neon_bfmlalb - neon_bfmlalt so that they take and return bf16 and float types. Previously these intrinsics passed their bf16 operands as <16 x i8> vectors (a holdover from when the implementation lacked a bf16 IR type). The neon_vbfdot[q] intrinsics, which previously used <8 x i8> and <16 x i8> operands, are adjusted similarly. This change required some additional selection patterns for vbfdot itself and also for vector shuffles (in a previous patch), because SelectionDAG transformations kick in and mangle the original code. This patch makes the generated IR cleaner (fewer useless bitcasts are produced), but it does not affect the final assembly. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D86146
This commit is contained in:
parent
5106495e6e
commit
f7e914e2c5
@ -184,6 +184,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
|
|||||||
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
|
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
|
||||||
[IntrNoMem]>;
|
[IntrNoMem]>;
|
||||||
|
|
||||||
|
class AdvSIMD_BF16FML_Intrinsic
|
||||||
|
: Intrinsic<[llvm_v4f32_ty],
|
||||||
|
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||||
|
[IntrNoMem]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Arithmetic ops
|
// Arithmetic ops
|
||||||
@ -466,9 +470,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
|
|||||||
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
|
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
|
||||||
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
|
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
|
||||||
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
|
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
|
||||||
def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic;
|
def int_aarch64_neon_bfmmla
|
||||||
def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic;
|
: Intrinsic<[llvm_v4f32_ty],
|
||||||
def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;
|
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||||
|
[IntrNoMem]>;
|
||||||
|
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
|
||||||
|
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
|
||||||
|
|
||||||
|
|
||||||
// v8.6-A Bfloat Intrinsics
|
// v8.6-A Bfloat Intrinsics
|
||||||
|
@ -791,14 +791,17 @@ def int_arm_neon_vcvtbfp2bf
|
|||||||
: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
|
: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||||
|
|
||||||
def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
|
def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
|
||||||
def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;
|
def int_arm_neon_bfmmla
|
||||||
|
: Intrinsic<[llvm_v4f32_ty],
|
||||||
|
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||||
|
[IntrNoMem]>;
|
||||||
|
|
||||||
class Neon_FML_Intrinsic
|
class Neon_BF16FML_Intrinsic
|
||||||
: Intrinsic<[llvm_anyvector_ty],
|
: Intrinsic<[llvm_v4f32_ty],
|
||||||
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
|
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||||
[IntrNoMem]>;
|
[IntrNoMem]>;
|
||||||
def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;
|
def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic;
|
||||||
def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;
|
def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic;
|
||||||
|
|
||||||
def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||||
def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
|
def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||||
|
@ -632,6 +632,63 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Changed in 12.0: bfdot accepts v4bf16 and v8bf16 instead of v8i8 and v16i8
|
||||||
|
// respectively
|
||||||
|
if ((Name.startswith("arm.neon.bfdot.") ||
|
||||||
|
Name.startswith("aarch64.neon.bfdot.")) &&
|
||||||
|
Name.endswith("i8")) {
|
||||||
|
Intrinsic::ID IID =
|
||||||
|
StringSwitch<Intrinsic::ID>(Name)
|
||||||
|
.Cases("arm.neon.bfdot.v2f32.v8i8",
|
||||||
|
"arm.neon.bfdot.v4f32.v16i8",
|
||||||
|
Intrinsic::arm_neon_bfdot)
|
||||||
|
.Cases("aarch64.neon.bfdot.v2f32.v8i8",
|
||||||
|
"aarch64.neon.bfdot.v4f32.v16i8",
|
||||||
|
Intrinsic::aarch64_neon_bfdot)
|
||||||
|
.Default(Intrinsic::not_intrinsic);
|
||||||
|
if (IID == Intrinsic::not_intrinsic)
|
||||||
|
break;
|
||||||
|
|
||||||
|
size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits();
|
||||||
|
assert((OperandWidth == 64 || OperandWidth == 128) &&
|
||||||
|
"Unexpected operand width");
|
||||||
|
LLVMContext &Ctx = F->getParent()->getContext();
|
||||||
|
std::array<Type *, 2> Tys {{
|
||||||
|
F->getReturnType(),
|
||||||
|
FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)
|
||||||
|
}};
|
||||||
|
NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore
|
||||||
|
// and accept v8bf16 instead of v16i8
|
||||||
|
if ((Name.startswith("arm.neon.bfm") ||
|
||||||
|
Name.startswith("aarch64.neon.bfm")) &&
|
||||||
|
Name.endswith(".v4f32.v16i8")) {
|
||||||
|
Intrinsic::ID IID =
|
||||||
|
StringSwitch<Intrinsic::ID>(Name)
|
||||||
|
.Case("arm.neon.bfmmla.v4f32.v16i8",
|
||||||
|
Intrinsic::arm_neon_bfmmla)
|
||||||
|
.Case("arm.neon.bfmlalb.v4f32.v16i8",
|
||||||
|
Intrinsic::arm_neon_bfmlalb)
|
||||||
|
.Case("arm.neon.bfmlalt.v4f32.v16i8",
|
||||||
|
Intrinsic::arm_neon_bfmlalt)
|
||||||
|
.Case("aarch64.neon.bfmmla.v4f32.v16i8",
|
||||||
|
Intrinsic::aarch64_neon_bfmmla)
|
||||||
|
.Case("aarch64.neon.bfmlalb.v4f32.v16i8",
|
||||||
|
Intrinsic::aarch64_neon_bfmlalb)
|
||||||
|
.Case("aarch64.neon.bfmlalt.v4f32.v16i8",
|
||||||
|
Intrinsic::aarch64_neon_bfmlalt)
|
||||||
|
.Default(Intrinsic::not_intrinsic);
|
||||||
|
if (IID == Intrinsic::not_intrinsic)
|
||||||
|
break;
|
||||||
|
|
||||||
|
std::array<Type *, 0> Tys;
|
||||||
|
NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3618,6 +3675,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case Intrinsic::arm_neon_bfdot:
|
||||||
|
case Intrinsic::arm_neon_bfmmla:
|
||||||
|
case Intrinsic::arm_neon_bfmlalb:
|
||||||
|
case Intrinsic::arm_neon_bfmlalt:
|
||||||
|
case Intrinsic::aarch64_neon_bfdot:
|
||||||
|
case Intrinsic::aarch64_neon_bfmmla:
|
||||||
|
case Intrinsic::aarch64_neon_bfmlalb:
|
||||||
|
case Intrinsic::aarch64_neon_bfmlalt: {
|
||||||
|
SmallVector<Value *, 3> Args;
|
||||||
|
assert(CI->getNumArgOperands() == 3 &&
|
||||||
|
"Mismatch between function args and call args");
|
||||||
|
size_t OperandWidth =
|
||||||
|
CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits();
|
||||||
|
assert((OperandWidth == 64 || OperandWidth == 128) &&
|
||||||
|
"Unexpected operand width");
|
||||||
|
Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16);
|
||||||
|
auto Iter = CI->arg_operands().begin();
|
||||||
|
Args.push_back(*Iter++);
|
||||||
|
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
|
||||||
|
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
|
||||||
|
NewCall = Builder.CreateCall(NewFn, Args);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case Intrinsic::bitreverse:
|
case Intrinsic::bitreverse:
|
||||||
NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
|
NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
|
||||||
break;
|
break;
|
||||||
|
@ -7841,9 +7841,9 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
|
|||||||
|
|
||||||
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
|
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
|
||||||
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
|
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
|
||||||
v2f32, v8i8>;
|
v2f32, v4bf16>;
|
||||||
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
|
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
|
||||||
v4f32, v16i8>;
|
v4f32, v8bf16>;
|
||||||
}
|
}
|
||||||
|
|
||||||
class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
|
class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
|
||||||
@ -7861,7 +7861,7 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
|
|||||||
(InputType RegType:$Rn),
|
(InputType RegType:$Rn),
|
||||||
(InputType (bitconvert (AccumType
|
(InputType (bitconvert (AccumType
|
||||||
(AArch64duplane32 (v4f32 V128:$Rm),
|
(AArch64duplane32 (v4f32 V128:$Rm),
|
||||||
VectorIndexH:$idx)))))))]> {
|
VectorIndexS:$idx)))))))]> {
|
||||||
|
|
||||||
bits<2> idx;
|
bits<2> idx;
|
||||||
let Inst{21} = idx{0}; // L
|
let Inst{21} = idx{0}; // L
|
||||||
@ -7871,16 +7871,16 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
|
|||||||
multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
|
multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
|
||||||
|
|
||||||
def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
|
def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
|
||||||
".2h", V64, v2f32, v8i8>;
|
".2h", V64, v2f32, v4bf16>;
|
||||||
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
|
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
|
||||||
".2h", V128, v4f32, v16i8>;
|
".2h", V128, v4f32, v8bf16>;
|
||||||
}
|
}
|
||||||
|
|
||||||
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
|
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
|
||||||
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
|
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
|
||||||
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
|
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
|
||||||
(v16i8 V128:$Rn),
|
(v8bf16 V128:$Rn),
|
||||||
(v16i8 V128:$Rm)))]> {
|
(v8bf16 V128:$Rm)))]> {
|
||||||
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
|
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -7890,10 +7890,10 @@ class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
|
|||||||
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
|
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
|
||||||
[(set (v4f32 V128:$dst),
|
[(set (v4f32 V128:$dst),
|
||||||
(v4f32 (OpNode (v4f32 V128:$Rd),
|
(v4f32 (OpNode (v4f32 V128:$Rd),
|
||||||
(v16i8 V128:$Rn),
|
(v8bf16 V128:$Rn),
|
||||||
(v16i8 (bitconvert (v8bf16
|
(v8bf16
|
||||||
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
|
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
|
||||||
VectorIndexH:$idx)))))))]>,
|
VectorIndexH:$idx)))))]>,
|
||||||
Sched<[WriteV]> {
|
Sched<[WriteV]> {
|
||||||
bits<5> Rd;
|
bits<5> Rd;
|
||||||
bits<5> Rn;
|
bits<5> Rn;
|
||||||
@ -7917,8 +7917,8 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
|
|||||||
V128, asm, ".4s",
|
V128, asm, ".4s",
|
||||||
[(set (v4f32 V128:$dst),
|
[(set (v4f32 V128:$dst),
|
||||||
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
|
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
|
||||||
(v16i8 V128:$Rn),
|
(v8bf16 V128:$Rn),
|
||||||
(v16i8 V128:$Rm)))]> {
|
(v8bf16 V128:$Rm)))]> {
|
||||||
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
|
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
|
||||||
", $Rm", ".8h", "}");
|
", $Rm", ".8h", "}");
|
||||||
}
|
}
|
||||||
|
@ -798,6 +798,23 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
|
|||||||
def BFCVTN : SIMD_BFCVTN;
|
def BFCVTN : SIMD_BFCVTN;
|
||||||
def BFCVTN2 : SIMD_BFCVTN2;
|
def BFCVTN2 : SIMD_BFCVTN2;
|
||||||
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
|
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
|
||||||
|
|
||||||
|
// Vector-scalar BFDOT:
|
||||||
|
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
|
||||||
|
// register (the instruction uses a single 32-bit lane from it), so the pattern
|
||||||
|
// is a bit tricky.
|
||||||
|
def : Pat<(v2f32 (int_aarch64_neon_bfdot
|
||||||
|
(v2f32 V64:$Rd), (v4bf16 V64:$Rn),
|
||||||
|
(v4bf16 (bitconvert
|
||||||
|
(v2i32 (AArch64duplane32
|
||||||
|
(v4i32 (bitconvert
|
||||||
|
(v8bf16 (insert_subvector undef,
|
||||||
|
(v4bf16 V64:$Rm),
|
||||||
|
(i64 0))))),
|
||||||
|
VectorIndexS:$idx)))))),
|
||||||
|
(BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
|
||||||
|
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
|
||||||
|
VectorIndexS:$idx)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ARMv8.6A AArch64 matrix multiplication
|
// ARMv8.6A AArch64 matrix multiplication
|
||||||
|
@ -9079,11 +9079,11 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
|
|||||||
(!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
|
(!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
|
def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>;
|
||||||
def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
|
def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>;
|
||||||
|
|
||||||
defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
|
defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>;
|
||||||
defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
|
defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
|
||||||
|
|
||||||
class BF16MM<bit Q, RegisterClass RegTy,
|
class BF16MM<bit Q, RegisterClass RegTy,
|
||||||
string opc>
|
string opc>
|
||||||
@ -9091,8 +9091,8 @@ class BF16MM<bit Q, RegisterClass RegTy,
|
|||||||
(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
|
(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
|
||||||
N3RegFrm, IIC_VDOTPROD, "", "",
|
N3RegFrm, IIC_VDOTPROD, "", "",
|
||||||
[(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
|
[(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
|
||||||
(v16i8 QPR:$Vn),
|
(v8bf16 QPR:$Vn),
|
||||||
(v16i8 QPR:$Vm)))]> {
|
(v8bf16 QPR:$Vm)))]> {
|
||||||
let Constraints = "$dst = $Vd";
|
let Constraints = "$dst = $Vd";
|
||||||
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
|
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
|
||||||
let DecoderNamespace = "VFPV8";
|
let DecoderNamespace = "VFPV8";
|
||||||
@ -9106,8 +9106,8 @@ class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
|
|||||||
NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
|
NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
|
||||||
[(set (v4f32 QPR:$dst),
|
[(set (v4f32 QPR:$dst),
|
||||||
(OpNode (v4f32 QPR:$Vd),
|
(OpNode (v4f32 QPR:$Vd),
|
||||||
(v16i8 QPR:$Vn),
|
(v8bf16 QPR:$Vn),
|
||||||
(v16i8 QPR:$Vm)))]> {
|
(v8bf16 QPR:$Vm)))]> {
|
||||||
let Constraints = "$dst = $Vd";
|
let Constraints = "$dst = $Vd";
|
||||||
let DecoderNamespace = "VFPV8";
|
let DecoderNamespace = "VFPV8";
|
||||||
}
|
}
|
||||||
@ -9128,9 +9128,9 @@ multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
|
|||||||
|
|
||||||
def : Pat<
|
def : Pat<
|
||||||
(v4f32 (OpNode (v4f32 QPR:$Vd),
|
(v4f32 (OpNode (v4f32 QPR:$Vd),
|
||||||
(v16i8 QPR:$Vn),
|
(v8bf16 QPR:$Vn),
|
||||||
(v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
|
(v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
|
||||||
VectorIndex16:$lane)))))),
|
VectorIndex16:$lane)))),
|
||||||
(!cast<Instruction>(NAME) QPR:$Vd,
|
(!cast<Instruction>(NAME) QPR:$Vd,
|
||||||
QPR:$Vn,
|
QPR:$Vn,
|
||||||
(EXTRACT_SUBREG QPR:$Vm,
|
(EXTRACT_SUBREG QPR:$Vm,
|
||||||
|
76
test/Bitcode/aarch64-bf16-upgrade.ll
Normal file
76
test/Bitcode/aarch64-bf16-upgrade.ll
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
; RUN: llvm-dis < %s.bc | FileCheck %s
|
||||||
|
|
||||||
|
; Bitcode was generated from file below
|
||||||
|
|
||||||
|
define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfdot_f32
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||||
|
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
||||||
|
; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
|
||||||
|
%vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
||||||
|
ret <2 x float> %vbfdot1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfdotq_f32
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
%vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
ret <4 x float> %vbfdot1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfmmlaq_f32
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||||
|
%vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
ret <4 x float> %vbfmmla1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
|
||||||
|
entry:
|
||||||
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||||
|
%vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
ret <4 x float> %vbfmlalb1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
|
||||||
|
entry:
|
||||||
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||||
|
%vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
ret <4 x float> %vbfmlalt1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
|
||||||
|
; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
0
test/Bitcode/aarch64-bf16-upgrade.ll.bc
Normal file
0
test/Bitcode/aarch64-bf16-upgrade.ll.bc
Normal file
76
test/Bitcode/arm-bf16-upgrade.ll
Normal file
76
test/Bitcode/arm-bf16-upgrade.ll
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
; RUN: llvm-dis < %s.bc | FileCheck %s
|
||||||
|
|
||||||
|
; Bitcode was generated from file below
|
||||||
|
|
||||||
|
define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfdot_f32
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||||
|
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
||||||
|
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
|
||||||
|
ret <2 x float> %vbfdot1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfdotq_f32
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||||
|
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
ret <4 x float> %vbfdot1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfmmlaq_f32
|
||||||
|
entry:
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||||
|
%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
ret <4 x float> %vbfmmla1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
|
||||||
|
entry:
|
||||||
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||||
|
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
ret <4 x float> %vbfmlalb1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
|
; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
|
||||||
|
entry:
|
||||||
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
|
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||||
|
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||||
|
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||||
|
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||||
|
; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||||
|
ret <4 x float> %vbfmlalt1.i
|
||||||
|
}
|
||||||
|
|
||||||
|
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
|
||||||
|
; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||||
|
; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
0
test/Bitcode/arm-bf16-upgrade.ll.bc
Normal file
0
test/Bitcode/arm-bf16-upgrade.ll.bc
Normal file
@ -7,10 +7,8 @@ define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat
|
|||||||
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h
|
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b)
|
||||||
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
ret <2 x float> %vbfdot3.i
|
||||||
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
|
||||||
ret <2 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -19,24 +17,22 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
|
|||||||
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h
|
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfdot3.i
|
||||||
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
; CHECK-LABEL: test_vbfdot_lane_f32:
|
; CHECK-LABEL: test_vbfdot_lane_f32:
|
||||||
; CHECK: // %bb.0: // %entry
|
; CHECK: // %bb.0: // %entry
|
||||||
; CHECK: bfdot v0.2s, v1.4h, v2.2h[0]
|
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||||
|
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[0]
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
|
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
|
||||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
|
||||||
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
ret <2 x float> %vbfdot3.i
|
||||||
ret <2 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -45,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
|
|||||||
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3]
|
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3]
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
|
||||||
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
ret <4 x float> %vbfdot3.i
|
||||||
ret <4 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
|
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -59,26 +54,25 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
|
|||||||
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]
|
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
|
||||||
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
ret <2 x float> %vbfdot3.i
|
||||||
ret <2 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
; CHECK-LABEL: test_vbfdotq_lane_f32:
|
; CHECK-LABEL: test_vbfdotq_lane_f32:
|
||||||
; CHECK: // %bb.0: // %entry
|
; CHECK: // %bb.0: // %entry
|
||||||
; CHECK: bfdot v0.4s, v1.8h, v2.2h[0]
|
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||||
|
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[0]
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
|
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
|
||||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
|
||||||
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
ret <4 x float> %vbfdot3.i
|
||||||
ret <4 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -87,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
|
|||||||
; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h
|
; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfmmlaq_v3.i
|
||||||
%vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmmla1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -99,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||||||
; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h
|
; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfmlalbq_v3.i
|
||||||
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalb1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -111,23 +101,20 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||||||
; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h
|
; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfmlaltq_v3.i
|
||||||
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalt1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
; CHECK-LABEL: test_vbfmlalbq_lane_f32:
|
; CHECK-LABEL: test_vbfmlalbq_lane_f32:
|
||||||
; CHECK: // %bb.0: // %entry
|
; CHECK: // %bb.0: // %entry
|
||||||
; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0]
|
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||||
|
; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[0]
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlalbq_v3.i
|
||||||
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalb1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -137,23 +124,20 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlalbq_v3.i
|
||||||
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalb1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
; CHECK-LABEL: test_vbfmlaltq_lane_f32:
|
; CHECK-LABEL: test_vbfmlaltq_lane_f32:
|
||||||
; CHECK: // %bb.0: // %entry
|
; CHECK: // %bb.0: // %entry
|
||||||
; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0]
|
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||||
|
; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[0]
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlaltq_v3.i
|
||||||
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalt1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -163,14 +147,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlaltq_v3.i
|
||||||
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalt1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
|
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||||
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
@ -7,10 +7,8 @@ define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat>
|
|||||||
; CHECK-NEXT: vdot.bf16 d0, d1, d2
|
; CHECK-NEXT: vdot.bf16 d0, d1, d2
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3
|
||||||
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
ret <2 x float> %vbfdot3.i
|
||||||
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
|
||||||
ret <2 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -19,10 +17,8 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
|
|||||||
; CHECK-NEXT: vdot.bf16 q0, q1, q2
|
; CHECK-NEXT: vdot.bf16 q0, q1, q2
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) #3
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfdot3.i
|
||||||
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
@ -31,12 +27,11 @@ define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x b
|
|||||||
; CHECK-NEXT: vdot.bf16 d0, d1, d2[0]
|
; CHECK-NEXT: vdot.bf16 d0, d1, d2[0]
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
|
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
|
||||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
|
||||||
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
ret <2 x float> %vbfdot3.i
|
||||||
ret <2 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -46,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
|
|||||||
; CHECK-NEXT: vdot.bf16 q0, q1, q8
|
; CHECK-NEXT: vdot.bf16 q0, q1, q8
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
|
||||||
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
ret <4 x float> %vbfdot3.i
|
||||||
ret <4 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
|
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -60,12 +54,11 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
|
|||||||
; CHECK-NEXT: vdot.bf16 d0, d1, d3[1]
|
; CHECK-NEXT: vdot.bf16 d0, d1, d3[1]
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
|
||||||
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
ret <2 x float> %vbfdot3.i
|
||||||
ret <2 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
@ -75,12 +68,11 @@ define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x
|
|||||||
; CHECK-NEXT: vdot.bf16 q0, q1, d4[0]
|
; CHECK-NEXT: vdot.bf16 q0, q1, d4[0]
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
|
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
|
||||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
|
||||||
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
ret <4 x float> %vbfdot3.i
|
||||||
ret <4 x float> %vbfdot1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -89,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
|
|||||||
; CHECK-NEXT: vmmla.bf16 q0, q1, q2
|
; CHECK-NEXT: vmmla.bf16 q0, q1, q2
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfmmlaq_v3.i
|
||||||
%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmmla1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -101,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||||||
; CHECK-NEXT: vfmab.bf16 q0, q1, q2
|
; CHECK-NEXT: vfmab.bf16 q0, q1, q2
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfmlalbq_v3.i
|
||||||
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalb1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -113,10 +101,8 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||||||
; CHECK-NEXT: vfmat.bf16 q0, q1, q2
|
; CHECK-NEXT: vfmat.bf16 q0, q1, q2
|
||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
ret <4 x float> %vbfmlaltq_v3.i
|
||||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalt1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
@ -127,10 +113,8 @@ define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
|
|||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlalbq_v3.i
|
||||||
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalb1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -140,10 +124,8 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlalbq_v3.i
|
||||||
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalb1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||||
@ -154,10 +136,8 @@ define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
|
|||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlaltq_v3.i
|
||||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalt1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -167,10 +147,8 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
ret <4 x float> %vbfmlaltq_v3.i
|
||||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalt1.i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||||
@ -181,14 +159,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a,
|
|||||||
; CHECK-NEXT: bx lr
|
; CHECK-NEXT: bx lr
|
||||||
entry:
|
entry:
|
||||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
|
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
|
||||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
|
||||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
|
||||||
ret <4 x float> %vbfmlalt1.i
|
ret <4 x float> %vbfmlalt1.i
|
||||||
}
|
}
|
||||||
|
|
||||||
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
|
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||||
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user