From b0f83ed2ae146a797f852176544c5a35a665c968 Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli
Date: Fri, 19 Jun 2020 21:27:46 +0000
Subject: [PATCH] [sve][acle] Implement some of the C intrinsics for brain float.

Summary: The following intrinsics have been extended to support brain float types:

svbfloat16_t svclasta[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data)
bfloat16_t svclasta[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data)
bfloat16_t svlasta[_bf16](svbool_t pg, svbfloat16_t op)

svbfloat16_t svclastb[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data)
bfloat16_t svclastb[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data)
bfloat16_t svlastb[_bf16](svbool_t pg, svbfloat16_t op)

svbfloat16_t svdup[_n]_bf16(bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_x(svbool_t pg, bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_z(svbool_t pg, bfloat16_t op)

svbfloat16_t svdupq[_n]_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3, bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7)
svbfloat16_t svdupq_lane[_bf16](svbfloat16_t data, uint64_t index)

svbfloat16_t svinsr[_n_bf16](svbfloat16_t op1, bfloat16_t op2)

Reviewers: sdesmalen, kmclaughlin, c-rhodes, ctetreau, efriedma

Subscribers: tschuett, hiraditya, rkruppe, psnobl, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D82345
---
 lib/Target/AArch64/AArch64SVEInstrInfo.td     | 35 ++++++++
 test/CodeGen/AArch64/sve-intrinsics-dup-x.ll  | 12 +++
 .../AArch64/sve-intrinsics-perm-select.ll     | 87 +++++++++++++++++++
 .../AArch64/sve-intrinsics-scalar-to-vec.ll   | 41 +++++++++
 test/CodeGen/AArch64/sve-intrinsics-shifts.ll | 12 +++
 test/CodeGen/AArch64/sve-vector-splat.ll      | 19 ++++
 6 files changed, 206 insertions(+)

diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d3a1c2789cf..0344aad8503 100644
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -423,6 +423,11 @@ let Predicates = [HasSVE] in {
   defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
   defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
+              (CPY_ZPmV_H $passthru, $pg, $splat)>;
+  }
+
   // Duplicate FP scalar into all vector elements
   def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
             (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
@@ -436,6 +441,10 @@ let Predicates = [HasSVE] in {
             (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
   def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
             (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+              (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+  }
 
   // Duplicate +0.0 into all vector elements
   def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
@@ -444,6 +453,9 @@ let Predicates = [HasSVE] in {
   def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
   def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
   def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+  }
 
   // Duplicate Int immediate into all vector elements
   def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
@@ -486,6 +498,10 @@ let Predicates = [HasSVE] in {
   defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
   defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
+  }
+
   defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
   defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
   defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
@@ -554,11 +570,23 @@ let Predicates = [HasSVE] in {
   defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
   defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
+    def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>;
+    def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
+    def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>;
+  }
+
   defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
   defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
   defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
   defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
+    def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
+  }
+
   // continuous load with reg+immediate
   defm LD1B_IMM    : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
   defm LD1B_H_IMM  : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
@@ -1499,6 +1527,13 @@ multiclass sve_prefetch
     def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
     def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
+  }
+
+  let Predicates = [IsLE, HasBF16, HasSVE] in {
+    def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
   }
 
   let Predicates = [IsLE, HasSVE, HasBF16] in {
diff --git a/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll b/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
index d6f240a15be..788592c131b 100644
--- a/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@@ -81,6 +81,14 @@ define <vscale x 8 x half> @dup_f16(half %b) {
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dup_bf16(bfloat %b) #0 {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 8 x half> @dup_imm_f16(half %b) {
 ; CHECK-LABEL: dup_imm_f16:
 ; CHECK: mov z0.h, #16.00000000
@@ -126,5 +134,9 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }
diff --git a/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index e5866c1b68f..433a1cdfd8e 100644
--- a/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -57,6 +57,16 @@ define <vscale x 8 x half> @clasta_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @clasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clasta_bf16:
+; CHECK: clasta z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_f32:
 ; CHECK: clasta z0.s, p0, z0.s, z1.s
@@ -131,6 +141,16 @@ define half @clasta_n_f16(<vscale x 8 x i1> %pg, half %a, <vscale x 8 x half> %b
   ret half %out
 }
 
+define bfloat @clasta_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clasta_n_bf16:
+; CHECK: clasta h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                         <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clasta_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_n_f32:
 ; CHECK: clasta s0, p0, s0, z1.s
@@ -205,6 +225,16 @@ define <vscale x 8 x half> @clastb_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @clastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clastb_bf16:
+; CHECK: clastb z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_f32:
 ; CHECK: clastb z0.s, p0, z0.s, z1.s
@@ -279,6 +309,16 @@ define half @clastb_n_f16(<vscale x 8 x i1> %pg, half %a, <vscale x 8 x half> %b
   ret half %out
 }
 
+define bfloat @clastb_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clastb_n_bf16:
+; CHECK: clastb h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                         <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clastb_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_n_f32:
 ; CHECK: clastb s0, p0, s0, z1.s
@@ -343,6 +383,14 @@ define <vscale x 8 x half> @dupq_f16(<vscale x 8 x half> %a) {
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dupq_bf16(<vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: dupq_bf16:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 0)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dupq_f32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: dupq_f32:
 ; CHECK: mov z0.q, z0.q[1]
@@ -432,6 +480,20 @@ define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
   ret <vscale x 8 x half> %out
 }
 
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x bfloat> @dupq_lane_bf16(<vscale x 8 x bfloat> %a, i64 %idx) #0 {
+; CHECK-LABEL: dupq_lane_bf16:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 %idx)
+  ret <vscale x 8 x bfloat> %out
+}
+
 ; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
 define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_f32:
@@ -605,6 +667,15 @@ define half @lasta_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
   ret half %res
 }
 
+define bfloat @lasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: lasta_bf16
+; CHECK: lasta h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lasta_f32
 ; CHECK: lasta s0, p0, z0.s
@@ -681,6 +752,15 @@ define half @lastb_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
   ret half %res
 }
 
+define bfloat @lastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: lastb_bf16
+; CHECK: lastb h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lastb_f32
 ; CHECK: lastb s0, p0, z0.s
@@ -1851,6 +1931,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.clasta.nxv8i16(<vscale x 8 x i1>, <
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 
@@ -1859,6 +1940,7 @@ declare i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x
 declare i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)
 
@@ -1867,6 +1949,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.clastb.nxv8i16(<vscale x 8 x i1>, <
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 
@@ -1875,6 +1958,7 @@ declare i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x
 declare i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)
 
@@ -1888,6 +1972,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
 
@@ -1905,6 +1990,7 @@ declare i16 @llvm.aarch64.sve.lasta.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16
 declare i32 @llvm.aarch64.sve.lasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lasta.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
@@ -1914,6 +2000,7 @@ declare i16 @llvm.aarch64.sve.lastb.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16
 declare i32 @llvm.aarch64.sve.lastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lastb.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
diff --git a/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll b/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
index a9ff1648f3b..9f679aa6dc4 100644
--- a/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
@@ -57,6 +57,16 @@ define <vscale x 8 x half> @dup_f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %p
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dup_bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, bfloat %b) #0 {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %a,
+                                                                   <vscale x 8 x i1> %pg,
+                                                                   bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dup_f32(<vscale x 4 x float> %a, <vscale x 4 x i1> %pg, float %b) {
 ; CHECK-LABEL: dup_f32:
 ; CHECK: mov z0.s, p0/m, s1
@@ -77,10 +87,41 @@ define <vscale x 2 x double> @dup_f64(<vscale x 2 x double> %a, <vscale x 2 x i1
   ret <vscale x 2 x double> %out
 }
 
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_z(<vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_z:
+; CHECK: mov z1.h, #0
+; CHECK: mov z1.h, p0/m, h0
+; CHECK: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_m(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_m:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_x(<vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_x:
+; CHECK: mov z0.h, p0/m, h0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }
diff --git a/test/CodeGen/AArch64/sve-intrinsics-shifts.ll b/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
index 0333eb79714..dd884d5577f 100644
--- a/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
+++ b/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
@@ -165,6 +165,14 @@ define <vscale x 8 x half> @insr_f16(<vscale x 8 x half> %a, half %b) {
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @insr_bf16(<vscale x 8 x bfloat> %a, bfloat %b) #0 {
+; CHECK-LABEL: insr_bf16:
+; CHECK: insr z0.h, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat> %a, bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @insr_f32(<vscale x 4 x float> %a, float %b) {
 ; CHECK-LABEL: insr_f32:
 ; CHECK: insr z0.s, s1
@@ -348,6 +356,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.insr.nxv8i16(<vscale x 8 x i16>, i1
 declare <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.insr.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.insr.nxv8f16(<vscale x 8 x half>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.insr.nxv4f32(<vscale x 4 x float>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.insr.nxv2f64(<vscale x 2 x double>, double)
 
@@ -368,3 +377,6 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vsc
 declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }
diff --git a/test/CodeGen/AArch64/sve-vector-splat.ll b/test/CodeGen/AArch64/sve-vector-splat.ll
index 5bacbee042c..af43f8fc97e 100644
--- a/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -172,6 +172,15 @@ define <vscale x 16 x i1> @sve_splat_16xi1(i1 %val) {
 
 ;; Splats of legal floating point vector types
 
+define <vscale x 8 x bfloat> @splat_nxv8bf16(bfloat %val) #0 {
+; CHECK-LABEL: splat_nxv8bf16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 8 x bfloat> undef, bfloat %val, i32 0
+  %2 = shufflevector <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %2
+}
+
 define <vscale x 8 x half> @splat_nxv8f16(half %val) {
 ; CHECK-LABEL: splat_nxv8f16:
 ; CHECK: mov z0.h, h0
@@ -233,6 +242,13 @@ define <vscale x 8 x half> @splat_nxv8f16_zero() {
   ret <vscale x 8 x half> zeroinitializer
 }
 
+define <vscale x 8 x bfloat> @splat_nxv8bf16_zero() #0 {
+; CHECK-LABEL: splat_nxv8bf16_zero:
+; CHECK: mov z0.h, #0
+; CHECK-NEXT: ret
+  ret <vscale x 8 x bfloat> zeroinitializer
+}
+
 define <vscale x 4 x half> @splat_nxv4f16_zero() {
 ; CHECK-LABEL: splat_nxv4f16_zero:
 ; CHECK: mov z0.h, #0
@@ -321,3 +337,6 @@ define <vscale x 2 x double> @splat_nxv2f64_imm() {
   %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
   ret <vscale x 2 x double> %2
 }
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }
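
Note (not part of the patch, for illustration only): a minimal C sketch of how the
intrinsics listed in the summary are expected to be used from ACLE code. It assumes
a compiler that provides arm_sve.h and is invoked with the +sve and +bf16 target
features; the function names below are made up for the example.

  #include <arm_sve.h>

  // Broadcast a bfloat16 scalar to every lane, then shift the vector up by one
  // element and insert a second scalar at element 0
  // (svdup[_n]_bf16 and svinsr[_n_bf16] from the summary).
  svbfloat16_t broadcast_and_insert(bfloat16_t splat, bfloat16_t first) {
    svbfloat16_t v = svdup_n_bf16(splat);
    return svinsr_n_bf16(v, first);
  }

  // Return the last active element of data, or fallback when no predicate lane
  // is active (svclastb[_n_bf16] from the summary).
  bfloat16_t last_active(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) {
    return svclastb_n_bf16(pg, fallback, data);
  }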