diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index d00456123f5..6065a8c36c2 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -1311,6 +1311,7 @@
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ld1rq : AdvSIMD_1Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld1ro : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
 //
 // Stores
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index e9698c0b4f3..0c2effa1eb7 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1463,6 +1463,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::LDFF1:             return "AArch64ISD::LDFF1";
   case AArch64ISD::LDFF1S:            return "AArch64ISD::LDFF1S";
   case AArch64ISD::LD1RQ:             return "AArch64ISD::LD1RQ";
+  case AArch64ISD::LD1RO:             return "AArch64ISD::LD1RO";
   case AArch64ISD::GLD1:              return "AArch64ISD::GLD1";
   case AArch64ISD::GLD1_SCALED:       return "AArch64ISD::GLD1_SCALED";
   case AArch64ISD::GLD1_SXTW:         return "AArch64ISD::GLD1_SXTW";
@@ -11885,7 +11886,10 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
   return L;
 }
 
-static SDValue performLD1RQCombine(SDNode *N, SelectionDAG &DAG) {
+template <unsigned Opcode>
+static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
+  static_assert(Opcode == AArch64ISD::LD1RQ || Opcode == AArch64ISD::LD1RO,
+                "Unsupported opcode.");
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
@@ -11894,13 +11898,13 @@
     LoadVT = VT.changeTypeToInteger();
 
   SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
-  SDValue Load = DAG.getNode(AArch64ISD::LD1RQ, DL, {LoadVT, MVT::Other}, Ops);
+  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
   SDValue LoadChain = SDValue(Load.getNode(), 1);
 
   if (VT.isFloatingPoint())
     Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
 
-  return DAG.getMergeValues({ Load, LoadChain }, DL);
+  return DAG.getMergeValues({Load, LoadChain}, DL);
 }
 
 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
@@ -13493,7 +13497,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     case Intrinsic::aarch64_sve_ldnt1:
       return performLDNT1Combine(N, DAG);
     case Intrinsic::aarch64_sve_ld1rq:
-      return performLD1RQCombine(N, DAG);
+      return performLD1ReplicateCombine<AArch64ISD::LD1RQ>(N, DAG);
+    case Intrinsic::aarch64_sve_ld1ro:
+      return performLD1ReplicateCombine<AArch64ISD::LD1RO>(N, DAG);
     case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
     case Intrinsic::aarch64_sve_ldnt1_gather:
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 2ae86b66576..81af7e2235a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -253,6 +253,7 @@ enum NodeType : unsigned {
   LDFF1,
   LDFF1S,
   LD1RQ,
+  LD1RO,
 
   // Unsigned gather loads.
  GLD1,
diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
index f5b983ac757..c33a4b4fc0f 100644
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -31,12 +31,13 @@ def AArch64ldff1s : SDNode<"AArch64ISD::LDFF1S", SDT_AArch64_LD1, [SDNPHasChain,
 
 // Contiguous load and replicate - node definitions
 //
-def SDT_AArch64_LD1RQ : SDTypeProfile<1, 2, [
+def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
   SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
 ]>;
 
-def AArch64ld1rq : SDNode<"AArch64ISD::LD1RQ", SDT_AArch64_LD1RQ, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1rq : SDNode<"AArch64ISD::LD1RQ", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1ro : SDNode<"AArch64ISD::LD1RO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
 
 // Gather loads - node definitions
 //
@@ -1434,6 +1435,7 @@ multiclass sve_prefetch
   def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+  def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
   def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
   def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
   def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
@@ -1954,10 +1956,10 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
 
 let Predicates = [HasSVE, HasMatMulFP64] in {
   defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
-  defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8>;
-  defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16>;
-  defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32>;
-  defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64>;
+  defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8,  nxv16i8, nxv16i1, AArch64ld1ro>;
+  defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1,  AArch64ld1ro>;
+  defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1,  AArch64ld1ro>;
+  defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1,  AArch64ld1ro>;
   defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8>;
   defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16>;
   defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32>;
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
index 25702e15ab5..6c8d49f1f1a 100644
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -7663,7 +7663,7 @@ class sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand VecList>
 }
 
 multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
-                           ZPRRegOp zprty> {
+                           ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> {
   def NAME : sve_mem_ldor_si<sz, asm, listty>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
                   (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
@@ -7671,6 +7671,11 @@ multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
                   (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
                   (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
+
+  // Base addressing mode
+  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)),
+            (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>;
+
 }
 
 class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
diff --git a/test/CodeGen/AArch64/sve-intrinsics-ld1ro.ll b/test/CodeGen/AArch64/sve-intrinsics-ld1ro.ll
new file mode 100644
index 00000000000..50b7c003f4d
--- /dev/null
+++ b/test/CodeGen/AArch64/sve-intrinsics-ld1ro.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+f64mm -asm-verbose=0 < %s | FileCheck %s
+
+;
+; LD1ROB
+;
+
+define <vscale x 16 x i8> @ld1rob_i8(<vscale x 16 x i1> %pred, i8* %addr) nounwind {
+; CHECK-LABEL: ld1rob_i8:
+; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
+  ret <vscale x 16 x i8> %res
+}
+
+;
+; LD1ROH
+;
+
+define <vscale x 8 x i16> @ld1roh_i16(<vscale x 8 x i1> %pred, i16* %addr) nounwind {
+; CHECK-LABEL: ld1roh_i16:
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1roh_half(<vscale x 8 x i1> %pred, half* %addr) nounwind {
+; CHECK-LABEL: ld1roh_half:
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
+  ret <vscale x 8 x half> %res
+}
+
+;
+; LD1ROW
+;
+
+define <vscale x 4 x i32> @ld1row_i32(<vscale x 4 x i1> %pred, i32* %addr) nounwind {
+; CHECK-LABEL: ld1row_i32:
+; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1row_float(<vscale x 4 x i1> %pred, float* %addr) nounwind {
+; CHECK-LABEL: ld1row_float:
+; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
+  ret <vscale x 4 x float> %res
+}
+
+;
+; LD1ROD
+;
+
+define <vscale x 2 x i64> @ld1rod_i64(<vscale x 2 x i1> %pred, i64* %addr) nounwind {
+; CHECK-LABEL: ld1rod_i64:
+; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rod_double(<vscale x 2 x i1> %pred, double* %addr) nounwind {
+; CHECK-LABEL: ld1rod_double:
+; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
+  ret <vscale x 2 x double> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1>, i8*)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1>, half*)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1>, float*)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1>, double*)
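
A minimal usage sketch, not part of the patch. LD1RO is the F64MM contiguous "load and replicate 256 bits" form, the octaword counterpart of LD1RQ: it loads a 32-byte block and repeats it across the whole scalable vector. The function name below is illustrative, and the sketch assumes the existing @llvm.aarch64.sve.ptrue intrinsic (pattern 31 selects all lanes); a typical caller builds a predicate and feeds the replicated load into ordinary vector code:

  define <vscale x 2 x double> @ld1ro_example(double* %addr, <vscale x 2 x double> %acc) {
    ; All-active predicate for .d elements (pattern 31 = all lanes).
    %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
    ; Loads 32 bytes from %addr and replicates them across the scalable
    ; vector; with this patch the call should select to LD1RO_D_IMM
    ; (ld1rod) via the new base addressing mode pattern.
    %ro = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pg, double* %addr)
    %sum = fadd <vscale x 2 x double> %acc, %ro
    ret <vscale x 2 x double> %sum
  }

  declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)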