[sve][acle] Implement some of the C intrinsics for brain float.

Summary: The following intrinsics have been extended to support brain float types: svbfloat16_t svclasta[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) bfloat16_t svclasta[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data) bfloat16_t svlasta[_bf16](svbool_t pg, svbfloat16_t op) svbfloat16_t svclastb[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) bfloat16_t svclastb[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data) bfloat16_t svlastb[_bf16](svbool_t pg, svbfloat16_t op) svbfloat16_t svdup[_n]_bf16(bfloat16_t op) svbfloat16_t svdup[_n]_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op) svbfloat16_t svdup[_n]_bf16_x(svbool_t pg, bfloat16_t op) svbfloat16_t svdup[_n]_bf16_z(svbool_t pg, bfloat16_t op) svbfloat16_t svdupq[_n]_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3, bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7) svbfloat16_t svdupq_lane[_bf16](svbfloat16_t data, uint64_t index) svbfloat16_t svinsr[_n_bf16](svbfloat16_t op1, bfloat16_t op2) Reviewers: sdesmalen, kmclaughlin, c-rhodes, ctetreau, efriedma Subscribers: tschuett, hiraditya, rkruppe, psnobl, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D82345
2024-11-23 11:13:28 +01:00 · 2020-06-19 21:27:46 +00:00 · 2020-06-19 21:27:46 +00:00 · b0f83ed2ae
commit b0f83ed2ae
parent 2c3749142b
6 changed files with 206 additions and 0 deletions
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@ -423,6 +423,11 @@ let Predicates = [HasSVE] in {
  defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
  defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;

+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
+              (CPY_ZPmV_H $passthru, $pg, $splat)>;
+  }
+
  // Duplicate FP scalar into all vector elements
  def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
            (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
@ -436,6 +441,10 @@ let Predicates = [HasSVE] in {
            (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
  def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
            (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+              (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+  }

  // Duplicate +0.0 into all vector elements
  def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
@ -444,6 +453,9 @@ let Predicates = [HasSVE] in {
  def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
  def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
  def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+  }

  // Duplicate Int immediate into all vector elements
  def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
@ -486,6 +498,10 @@ let Predicates = [HasSVE] in {
  defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
  defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;

+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
+  }
+
  defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
  defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
  defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
@ -554,11 +570,23 @@ let Predicates = [HasSVE] in {
  defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
  defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;

+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_3_Op_Pat<bf16,     AArch64clasta_n,        nxv8i1, bf16,     nxv8bf16, CLASTA_VPZ_H>;
+    def : SVE_3_Op_Pat<bf16,     AArch64clastb_n,        nxv8i1, bf16,     nxv8bf16, CLASTB_VPZ_H>;
+    def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
+    def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
+  }
+
  defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
  defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
  defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
  defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;

+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
+    def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
+  }
+
  // continuous load with reg+immediate
  defm LD1B_IMM    : sve_mem_cld_si<0b0000, "ld1b",  Z_b, ZPR8>;
  defm LD1B_H_IMM  : sve_mem_cld_si<0b0001, "ld1b",  Z_h, ZPR16>;
@ -1499,6 +1527,13 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
    def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
    def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
    def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
+  }
+
+  let Predicates = [IsLE, HasBF16, HasSVE] in {
+    def : Pat<(nxv2i64  (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))),  (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))),  (nxv8bf16 ZPR:$src)>;
  }

  let Predicates = [IsLE, HasSVE, HasBF16] in {
--- a/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@ -81,6 +81,14 @@ define <vscale x 8 x half> @dup_f16(half %b) {
  ret <vscale x 8 x half> %out
 }

+define <vscale x 8 x bfloat> @dup_bf16(bfloat %b) #0 {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 8 x half> @dup_imm_f16(half %b) {
 ; CHECK-LABEL: dup_imm_f16:
 ; CHECK: mov z0.h, #16.00000000
@ -126,5 +134,9 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@ -57,6 +57,16 @@ define <vscale x 8 x half> @clasta_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half
  ret <vscale x 8 x half> %out
 }

+define <vscale x 8 x bfloat> @clasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clasta_bf16:
+; CHECK: clasta z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_f32:
 ; CHECK: clasta z0.s, p0, z0.s, z1.s
@ -131,6 +141,16 @@ define half @clasta_n_f16(<vscale x 8 x i1> %pg, half %a, <vscale x 8 x half> %b
  ret half %out
 }

+define bfloat @clasta_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clasta_n_bf16:
+; CHECK: clasta h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                         <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clasta_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_n_f32:
 ; CHECK: clasta s0, p0, s0, z1.s
@ -205,6 +225,16 @@ define <vscale x 8 x half> @clastb_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half
  ret <vscale x 8 x half> %out
 }

+define <vscale x 8 x bfloat> @clastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clastb_bf16:
+; CHECK: clastb z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_f32:
 ; CHECK: clastb z0.s, p0, z0.s, z1.s
@ -279,6 +309,16 @@ define half @clastb_n_f16(<vscale x 8 x i1> %pg, half %a, <vscale x 8 x half> %b
  ret half %out
 }

+define bfloat @clastb_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clastb_n_bf16:
+; CHECK: clastb h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                         <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clastb_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_n_f32:
 ; CHECK: clastb s0, p0, s0, z1.s
@ -343,6 +383,14 @@ define <vscale x 8 x half> @dupq_f16(<vscale x 8 x half> %a) {
  ret <vscale x 8 x half> %out
 }

+define <vscale x 8 x bfloat> @dupq_bf16(<vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: dupq_bf16:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 0)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dupq_f32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: dupq_f32:
 ; CHECK: mov z0.q, z0.q[1]
@ -432,6 +480,20 @@ define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
  ret <vscale x 8 x half> %out
 }

+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x bfloat> @dupq_lane_bf16(<vscale x 8 x bfloat> %a, i64 %idx) #0 {
+; CHECK-LABEL: dupq_lane_bf16:
+; CHECK-DAG:  index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG:  and   [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG:  add   [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG:  mov   [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK:      add   [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 %idx)
+  ret <vscale x 8 x bfloat> %out
+}
+
 ; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
 define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_f32:
@ -605,6 +667,15 @@ define half @lasta_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
  ret half %res
 }

+define bfloat @lasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: lasta_bf16
+; CHECK: lasta h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lasta_f32
 ; CHECK: lasta s0, p0, z0.s
@ -681,6 +752,15 @@ define half @lastb_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
  ret half %res
 }

+define bfloat @lastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: lastb_bf16
+; CHECK: lastb h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lastb_f32
 ; CHECK: lastb s0, p0, z0.s
@ -1851,6 +1931,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.clasta.nxv8i16(<vscale x 8 x i1>, <
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)

@ -1859,6 +1940,7 @@ declare i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x
 declare i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)

@ -1867,6 +1949,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.clastb.nxv8i16(<vscale x 8 x i1>, <
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)

@ -1875,6 +1958,7 @@ declare i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x
 declare i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)

@ -1888,6 +1972,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)

@ -1905,6 +1990,7 @@ declare i16 @llvm.aarch64.sve.lasta.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16
 declare i32 @llvm.aarch64.sve.lasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lasta.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
@ -1914,6 +2000,7 @@ declare i16 @llvm.aarch64.sve.lastb.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16
 declare i32 @llvm.aarch64.sve.lastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lastb.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
--- a/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
@ -57,6 +57,16 @@ define <vscale x 8 x half> @dup_f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %p
  ret <vscale x 8 x half> %out
 }

+define <vscale x 8 x bfloat> @dup_bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, bfloat %b) #0 {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %a,
+                                                                   <vscale x 8 x i1> %pg,
+                                                                   bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dup_f32(<vscale x 4 x float> %a, <vscale x 4 x i1> %pg, float %b) {
 ; CHECK-LABEL: dup_f32:
 ; CHECK: mov z0.s, p0/m, s1
@ -77,10 +87,41 @@ define <vscale x 2 x double> @dup_f64(<vscale x 2 x double> %a, <vscale x 2 x i1
  ret <vscale x 2 x double> %out
 }

+define <vscale x 8 x bfloat> @test_svdup_n_bf16_z(<vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_z:
+; CHECK: mov z1.h, #0
+; CHECK: mov z1.h, p0/m, h0
+; CHECK: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_m(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_m:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_x(<vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_x:
+; CHECK: mov z0.h, p0/m, h0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
@ -165,6 +165,14 @@ define <vscale x 8 x half> @insr_f16(<vscale x 8 x half> %a, half %b) {
  ret <vscale x 8 x half> %out
 }

+define <vscale x 8 x bfloat> @insr_bf16(<vscale x 8 x bfloat> %a, bfloat %b) #0 {
+; CHECK-LABEL: insr_bf16:
+; CHECK: insr z0.h, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat> %a, bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @insr_f32(<vscale x 4 x float> %a, float %b) {
 ; CHECK-LABEL: insr_f32:
 ; CHECK: insr z0.s, s1
@ -348,6 +356,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.insr.nxv8i16(<vscale x 8 x i16>, i1
 declare <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.insr.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.insr.nxv8f16(<vscale x 8 x half>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.insr.nxv4f32(<vscale x 4 x float>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.insr.nxv2f64(<vscale x 2 x double>, double)

@ -368,3 +377,6 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vsc
 declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/test/CodeGen/AArch64/sve-vector-splat.ll
@ -172,6 +172,15 @@ define <vscale x 16 x i1> @sve_splat_16xi1(i1 %val) {

 ;; Splats of legal floating point vector types

+define <vscale x 8 x bfloat> @splat_nxv8bf16(bfloat %val) #0 {
+; CHECK-LABEL: splat_nxv8bf16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 8 x bfloat> undef, bfloat %val, i32 0
+  %2 = shufflevector <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %2
+}
+
 define <vscale x 8 x half> @splat_nxv8f16(half %val) {
 ; CHECK-LABEL: splat_nxv8f16:
 ; CHECK: mov z0.h, h0
@ -233,6 +242,13 @@ define <vscale x 8 x half> @splat_nxv8f16_zero() {
  ret <vscale x 8 x half> zeroinitializer
 }

+define <vscale x 8 x bfloat> @splat_nxv8bf16_zero() #0 {
+; CHECK-LABEL: splat_nxv8bf16_zero:
+; CHECK: mov z0.h, #0
+; CHECK-NEXT: ret
+  ret <vscale x 8 x bfloat> zeroinitializer
+}
+
 define <vscale x 4 x half> @splat_nxv4f16_zero() {
 ; CHECK-LABEL: splat_nxv4f16_zero:
 ; CHECK: mov z0.h, #0
@ -321,3 +337,6 @@ define <vscale x 2 x double> @splat_nxv2f64_imm() {
  %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %2
 }
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }