[ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics

This patch adjusts the following ARM/AArch64 LLVM IR intrinsics: - neon_bfmmla - neon_bfmlalb - neon_bfmlalt so that they take and return bf16 and float types. Previously these intrinsics used <8 x i8> and <4 x i8> vectors (a rudiment from implementation lacking bf16 IR type). The neon_vbfdot[q] intrinsics are adjusted similarly. This change required some additional selection patterns for vbfdot itself and also for vector shuffles (in a previous patch) because of SelectionDAG transformations kicking in and mangling the original code. This patch makes the generated IR cleaner (less useless bitcasts are produced), but it does not affect the final assembly. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D86146
2025-01-31 12:41:49 +01:00 · 2020-08-27 18:43:16 +01:00 · 2020-08-27 18:43:16 +01:00 · f7e914e2c5
commit f7e914e2c5
parent 5106495e6e
12 changed files with 388 additions and 170 deletions
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@ -184,6 +184,10 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                [IntrNoMem]>;

+  class AdvSIMD_BF16FML_Intrinsic
+    : Intrinsic<[llvm_v4f32_ty],
+                [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+                [IntrNoMem]>;
 }

 // Arithmetic ops
@ -466,9 +470,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
  def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
  def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
  def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
-  def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic;
-  def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic;
-  def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;
+  def int_aarch64_neon_bfmmla
+    : Intrinsic<[llvm_v4f32_ty],
+                [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+                [IntrNoMem]>;
+  def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
+  def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;


  // v8.6-A Bfloat Intrinsics
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@ -791,14 +791,17 @@ def int_arm_neon_vcvtbfp2bf
    : Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;

 def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
-def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;
+def int_arm_neon_bfmmla
+    : Intrinsic<[llvm_v4f32_ty],
+                [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+                [IntrNoMem]>;

-class Neon_FML_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty],
-              [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
-              [IntrNoMem]>;
-def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;
-def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;
+class Neon_BF16FML_Intrinsic
+    : Intrinsic<[llvm_v4f32_ty],
+                [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+                [IntrNoMem]>;
+def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic;
+def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic;

 def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@ -632,6 +632,63 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
        return true;
      }
    }
+
+    // Changed in 12.0: bfdot accept v4bf16 and v8bf16 instead of v8i8 and v16i8
+    // respectively
+    if ((Name.startswith("arm.neon.bfdot.") ||
+         Name.startswith("aarch64.neon.bfdot.")) &&
+        Name.endswith("i8")) {
+      Intrinsic::ID IID =
+          StringSwitch<Intrinsic::ID>(Name)
+              .Cases("arm.neon.bfdot.v2f32.v8i8",
+                     "arm.neon.bfdot.v4f32.v16i8",
+                     Intrinsic::arm_neon_bfdot)
+              .Cases("aarch64.neon.bfdot.v2f32.v8i8",
+                     "aarch64.neon.bfdot.v4f32.v16i8",
+                     Intrinsic::aarch64_neon_bfdot)
+              .Default(Intrinsic::not_intrinsic);
+      if (IID == Intrinsic::not_intrinsic)
+        break;
+
+      size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits();
+      assert((OperandWidth == 64 || OperandWidth == 128) &&
+             "Unexpected operand width");
+      LLVMContext &Ctx = F->getParent()->getContext();
+      std::array<Type *, 2> Tys {{
+        F->getReturnType(),
+        FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)
+      }};
+      NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
+      return true;
+    }
+
+    // Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore
+    // and accept v8bf16 instead of v16i8
+    if ((Name.startswith("arm.neon.bfm") ||
+         Name.startswith("aarch64.neon.bfm")) &&
+        Name.endswith(".v4f32.v16i8")) {
+      Intrinsic::ID IID =
+          StringSwitch<Intrinsic::ID>(Name)
+              .Case("arm.neon.bfmmla.v4f32.v16i8",
+                    Intrinsic::arm_neon_bfmmla)
+              .Case("arm.neon.bfmlalb.v4f32.v16i8",
+                    Intrinsic::arm_neon_bfmlalb)
+              .Case("arm.neon.bfmlalt.v4f32.v16i8",
+                    Intrinsic::arm_neon_bfmlalt)
+              .Case("aarch64.neon.bfmmla.v4f32.v16i8",
+                    Intrinsic::aarch64_neon_bfmmla)
+              .Case("aarch64.neon.bfmlalb.v4f32.v16i8",
+                    Intrinsic::aarch64_neon_bfmlalb)
+              .Case("aarch64.neon.bfmlalt.v4f32.v16i8",
+                    Intrinsic::aarch64_neon_bfmlalt)
+              .Default(Intrinsic::not_intrinsic);
+      if (IID == Intrinsic::not_intrinsic)
+        break;
+
+      std::array<Type *, 0> Tys;
+      NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
+      return true;
+    }
    break;
  }

@ -3618,6 +3675,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
    break;
  }

+  case Intrinsic::arm_neon_bfdot:
+  case Intrinsic::arm_neon_bfmmla:
+  case Intrinsic::arm_neon_bfmlalb:
+  case Intrinsic::arm_neon_bfmlalt:
+  case Intrinsic::aarch64_neon_bfdot:
+  case Intrinsic::aarch64_neon_bfmmla:
+  case Intrinsic::aarch64_neon_bfmlalb:
+  case Intrinsic::aarch64_neon_bfmlalt: {
+    SmallVector<Value *, 3> Args;
+    assert(CI->getNumArgOperands() == 3 &&
+           "Mismatch between function args and call args");
+    size_t OperandWidth =
+        CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits();
+    assert((OperandWidth == 64 || OperandWidth == 128) &&
+           "Unexpected operand width");
+    Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16);
+    auto Iter = CI->arg_operands().begin();
+    Args.push_back(*Iter++);
+    Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
+    Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
+  }
+
  case Intrinsic::bitreverse:
    NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
    break;
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@ -7841,9 +7841,9 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,

 multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
  def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
-                                           v2f32, v8i8>;
+                                           v2f32, v4bf16>;
  def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
-                                           v4f32, v16i8>;
+                                           v4f32, v8bf16>;
 }

 class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
@ -7861,7 +7861,7 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
                                 (InputType RegType:$Rn),
                                 (InputType (bitconvert (AccumType
                                    (AArch64duplane32 (v4f32 V128:$Rm),
-                                        VectorIndexH:$idx)))))))]> {
+                                        VectorIndexS:$idx)))))))]> {

  bits<2> idx;
  let Inst{21}    = idx{0};  // L
@ -7871,16 +7871,16 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
 multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {

  def v4bf16  : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
-                                               ".2h", V64, v2f32, v8i8>;
+                                               ".2h", V64, v2f32, v4bf16>;
  def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
-                                              ".2h", V128, v4f32, v16i8>;
+                                              ".2h", V128, v4f32, v8bf16>;
 }

 class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
  : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
              [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
-                                               (v16i8 V128:$Rn),
-                                               (v16i8 V128:$Rm)))]> {
+                                               (v8bf16 V128:$Rn),
+                                               (v8bf16 V128:$Rm)))]> {
  let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
 }

@ -7890,10 +7890,10 @@ class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
      "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
          [(set (v4f32 V128:$dst),
                (v4f32 (OpNode (v4f32 V128:$Rd),
-                               (v16i8 V128:$Rn),
-                               (v16i8 (bitconvert (v8bf16
+                               (v8bf16 V128:$Rn),
+                               (v8bf16
                                  (AArch64duplane16 (v8bf16 V128_lo:$Rm),
-                                      VectorIndexH:$idx)))))))]>,
+                                      VectorIndexH:$idx)))))]>,
    Sched<[WriteV]> {
  bits<5> Rd;
  bits<5> Rn;
@ -7917,8 +7917,8 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
                                V128, asm, ".4s",
                          [(set (v4f32 V128:$dst),
                                (int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
-                                                         (v16i8 V128:$Rn),
-                                                         (v16i8 V128:$Rm)))]> {
+                                                         (v8bf16 V128:$Rn),
+                                                         (v8bf16 V128:$Rm)))]> {
  let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
                                    ", $Rm", ".8h", "}");
 }
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@ -798,6 +798,23 @@ def BFMLALTIdx   : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
 def BFCVTN       : SIMD_BFCVTN;
 def BFCVTN2      : SIMD_BFCVTN2;
 def BFCVT        : BF16ToSinglePrecision<"bfcvt">;
+
+// Vector-scalar BFDOT:
+// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
+// register (the instruction uses a single 32-bit lane from it), so the pattern
+// is a bit tricky.
+def : Pat<(v2f32 (int_aarch64_neon_bfdot
+                    (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
+                    (v4bf16 (bitconvert
+                      (v2i32 (AArch64duplane32
+                        (v4i32 (bitconvert
+                          (v8bf16 (insert_subvector undef,
+                            (v4bf16 V64:$Rm),
+                            (i64 0))))),
+                        VectorIndexS:$idx)))))),
+          (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
+                             (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                             VectorIndexS:$idx)>;
 }

 // ARMv8.6A AArch64 matrix multiplication
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@ -9079,11 +9079,11 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
    (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
 }

-def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
-def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
+def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>;
+def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>;

-defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
-defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>;
+defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;

 class BF16MM<bit Q, RegisterClass RegTy,
             string opc>
@ -9091,8 +9091,8 @@ class BF16MM<bit Q, RegisterClass RegTy,
           (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
           N3RegFrm, IIC_VDOTPROD, "", "",
                [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
-                                                (v16i8 QPR:$Vn),
-                                                (v16i8 QPR:$Vm)))]> {
+                                                (v8bf16 QPR:$Vn),
+                                                (v8bf16 QPR:$Vm)))]> {
   let Constraints = "$dst = $Vd";
   let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
   let DecoderNamespace = "VFPV8";
@ -9106,8 +9106,8 @@ class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
           NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
                [(set (v4f32 QPR:$dst),
                      (OpNode (v4f32 QPR:$Vd),
-                              (v16i8 QPR:$Vn),
-                              (v16i8 QPR:$Vm)))]> {
+                              (v8bf16 QPR:$Vn),
+                              (v8bf16 QPR:$Vm)))]> {
  let Constraints = "$dst = $Vd";
  let DecoderNamespace = "VFPV8";
 }
@ -9128,9 +9128,9 @@ multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {

  def : Pat<
    (v4f32 (OpNode (v4f32 QPR:$Vd),
-                   (v16i8 QPR:$Vn),
-                   (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
-                                                           VectorIndex16:$lane)))))),
+                   (v8bf16 QPR:$Vn),
+                   (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
+                            VectorIndex16:$lane)))),
    (!cast<Instruction>(NAME) QPR:$Vd,
                              QPR:$Vn,
                              (EXTRACT_SUBREG QPR:$Vm,
--- a/test/Bitcode/aarch64-bf16-upgrade.ll
+++ b/test/Bitcode/aarch64-bf16-upgrade.ll
@ -0,0 +1,76 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; Bitcode was generated from file below
+
+define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfdot_f32
+entry:
+  %0 = bitcast <4 x bfloat> %a to <8 x i8>
+  %1 = bitcast <4 x bfloat> %b to <8 x i8>
+  ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
+  ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
+  %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
+  ret <2 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfdotq_f32
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ret <4 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfmmlaq_f32
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  ret <4 x float> %vbfmmla1.i
+}
+
+define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
+entry:
+  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  ret <4 x float> %vbfmlalb1.i
+}
+
+define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
+entry:
+  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  ret <4 x float> %vbfmlalt1.i
+}
+
+declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
+; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
--- a/test/Bitcode/aarch64-bf16-upgrade.ll.bc
+++ b/test/Bitcode/aarch64-bf16-upgrade.ll.bc
--- a/test/Bitcode/arm-bf16-upgrade.ll
+++ b/test/Bitcode/arm-bf16-upgrade.ll
@ -0,0 +1,76 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; Bitcode was generated from file below
+
+define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfdot_f32
+entry:
+  %0 = bitcast <4 x bfloat> %a to <8 x i8>
+  %1 = bitcast <4 x bfloat> %b to <8 x i8>
+  %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
+  ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
+  ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
+  ret <2 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfdotq_f32
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  ret <4 x float> %vbfdot1.i
+}
+
+define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfmmlaq_f32
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  ret <4 x float> %vbfmmla1.i
+}
+
+define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
+entry:
+  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  ret <4 x float> %vbfmlalb1.i
+}
+
+define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
+entry:
+  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
+  ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
+  ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
+  ret <4 x float> %vbfmlalt1.i
+}
+
+declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
+; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
--- a/test/Bitcode/arm-bf16-upgrade.ll.bc
+++ b/test/Bitcode/arm-bf16-upgrade.ll.bc
--- a/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
+++ b/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
@ -7,10 +7,8 @@ define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat
 ; CHECK-NEXT:    bfdot v0.2s, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <4 x bfloat> %a to <8 x i8>
-  %1 = bitcast <4 x bfloat> %b to <8 x i8>
-  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
-  ret <2 x float> %vbfdot1.i
+  %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b)
+  ret <2 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -19,24 +17,22 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
 ; CHECK-NEXT:    bfdot v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfdot1.i
+  %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfdot3.i
 }

 define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
 ; CHECK-LABEL: test_vbfdot_lane_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK:    bfdot v0.2s, v1.4h, v2.2h[0]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    bfdot v0.2s, v1.4h, v2.2h[0]
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <4 x bfloat> %b to <2 x float>
-  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
-  %1 = bitcast <4 x bfloat> %a to <8 x i8>
-  %2 = bitcast <2 x float> %shuffle to <8 x i8>
-  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
-  ret <2 x float> %vbfdot1.i
+  %.cast = bitcast <4 x bfloat> %b to <2 x float>
+  %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
+  %.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
+  %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
+  ret <2 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -45,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
 ; CHECK-NEXT:    bfdot v0.4s, v1.8h, v2.2h[3]
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %b to <4 x float>
-  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %1 = bitcast <8 x bfloat> %a to <16 x i8>
-  %2 = bitcast <4 x float> %shuffle to <16 x i8>
-  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
-  ret <4 x float> %vbfdot1.i
+  %.cast = bitcast <8 x bfloat> %b to <4 x float>
+  %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
+  %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
+  ret <4 x float> %vbfdot3.i
 }

 define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
@ -59,26 +54,25 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
 ; CHECK-NEXT:    bfdot v0.2s, v1.4h, v2.2h[3]
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %b to <4 x float>
-  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
-  %1 = bitcast <4 x bfloat> %a to <8 x i8>
-  %2 = bitcast <2 x float> %shuffle to <8 x i8>
-  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
-  ret <2 x float> %vbfdot1.i
+  %.cast = bitcast <8 x bfloat> %b to <4 x float>
+  %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
+  %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
+  ret <2 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
 ; CHECK-LABEL: test_vbfdotq_lane_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK:    bfdot v0.4s, v1.8h, v2.2h[0]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    bfdot v0.4s, v1.8h, v2.2h[0]
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <4 x bfloat> %b to <2 x float>
-  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
-  %1 = bitcast <8 x bfloat> %a to <16 x i8>
-  %2 = bitcast <4 x float> %shuffle to <16 x i8>
-  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
-  ret <4 x float> %vbfdot1.i
+  %.cast = bitcast <4 x bfloat> %b to <2 x float>
+  %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
+  %.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
+  %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
+  ret <4 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -87,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
 ; CHECK-NEXT:    bfmmla v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmmla1.i
+  %vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmmlaq_v3.i
 }

 define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -99,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
 ; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlalbq_v3.i
 }

 define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -111,23 +101,20 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
 ; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlaltq_v3.i
 }

 define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
 ; CHECK-LABEL: test_vbfmlalbq_lane_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK:    bfmlalb v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.h[0]
 ; CHECK-NEXT:    ret
 entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }

 define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -137,23 +124,20 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
 ; CHECK-NEXT:    ret
 entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }

 define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
 ; CHECK-LABEL: test_vbfmlaltq_lane_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK:    bfmlalt v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.h[0]
 ; CHECK-NEXT:    ret
 entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }

 define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -163,14 +147,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
 ; CHECK-NEXT:    ret
 entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }

-declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
+declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
--- a/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
+++ b/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
@ -7,10 +7,8 @@ define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat>
 ; CHECK-NEXT:    vdot.bf16 d0, d1, d2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <4 x bfloat> %a to <8 x i8>
-  %1 = bitcast <4 x bfloat> %b to <8 x i8>
-  %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
-  ret <2 x float> %vbfdot1.i
+  %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3
+  ret <2 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -19,10 +17,8 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
 ; CHECK-NEXT:    vdot.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfdot1.i
+  %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) #3
+  ret <4 x float> %vbfdot3.i
 }

 define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
@ -31,12 +27,11 @@ define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x b
 ; CHECK-NEXT:    vdot.bf16 d0, d1, d2[0]
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <4 x bfloat> %b to <2 x float>
-  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
-  %1 = bitcast <4 x bfloat> %a to <8 x i8>
-  %2 = bitcast <2 x float> %shuffle to <8 x i8>
-  %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
-  ret <2 x float> %vbfdot1.i
+  %.cast = bitcast <4 x bfloat> %b to <2 x float>
+  %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
+  %.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
+  %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
+  ret <2 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -46,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
 ; CHECK-NEXT:    vdot.bf16 q0, q1, q8
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %b to <4 x float>
-  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %1 = bitcast <8 x bfloat> %a to <16 x i8>
-  %2 = bitcast <4 x float> %shuffle to <16 x i8>
-  %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
-  ret <4 x float> %vbfdot1.i
+  %.cast = bitcast <8 x bfloat> %b to <4 x float>
+  %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
+  %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
+  ret <4 x float> %vbfdot3.i
 }

 define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
@ -60,12 +54,11 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
 ; CHECK-NEXT:    vdot.bf16 d0, d1, d3[1]
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %b to <4 x float>
-  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
-  %1 = bitcast <4 x bfloat> %a to <8 x i8>
-  %2 = bitcast <2 x float> %shuffle to <8 x i8>
-  %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
-  ret <2 x float> %vbfdot1.i
+  %.cast = bitcast <8 x bfloat> %b to <4 x float>
+  %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
+  %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
+  ret <2 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@ -75,12 +68,11 @@ define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x
 ; CHECK-NEXT:    vdot.bf16 q0, q1, d4[0]
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <4 x bfloat> %b to <2 x float>
-  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
-  %1 = bitcast <8 x bfloat> %a to <16 x i8>
-  %2 = bitcast <4 x float> %shuffle to <16 x i8>
-  %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
-  ret <4 x float> %vbfdot1.i
+  %.cast = bitcast <4 x bfloat> %b to <2 x float>
+  %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
+  %.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
+  %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
+  ret <4 x float> %vbfdot3.i
 }

 define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -89,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
 ; CHECK-NEXT:    vmmla.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmmla1.i
+  %vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmmlaq_v3.i
 }

 define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -101,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
 ; CHECK-NEXT:    vfmab.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlalbq_v3.i
 }

 define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -113,10 +101,8 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
 ; CHECK-NEXT:    vfmat.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlaltq_v3.i
 }

 define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@ -127,10 +113,8 @@ define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
 ; CHECK-NEXT:    bx lr
 entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }

 define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -140,10 +124,8 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
 ; CHECK-NEXT:    bx lr
 entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }

 define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@ -154,10 +136,8 @@ define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
 ; CHECK-NEXT:    bx lr
 entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }

 define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -167,10 +147,8 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
 ; CHECK-NEXT:    bx lr
 entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }

 define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -181,14 +159,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a,
 ; CHECK-NEXT:    bx lr
 entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
  ret <4 x float> %vbfmlalt1.i
 }

-declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
-declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)