diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 480600dcd93..b22eb3b154f 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -47,18 +47,6 @@ static cl::opt<bool>
     cl::desc("Call nonlazybind functions via direct GOT load"),
     cl::init(false), cl::Hidden);
 
-static cl::opt<unsigned> SVEVectorBitsMax(
-    "aarch64-sve-vector-bits-max",
-    cl::desc("Assume SVE vector registers are at most this big, "
-             "with zero meaning no maximum size is assumed."),
-    cl::init(0), cl::Hidden);
-
-static cl::opt<unsigned> SVEVectorBitsMin(
-    "aarch64-sve-vector-bits-min",
-    cl::desc("Assume SVE vector registers are at least this big, "
-             "with zero meaning no minimum size is assumed."),
-    cl::init(0), cl::Hidden);
-
 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                            cl::desc("Enable the use of AA during codegen."));
 
@@ -210,14 +198,17 @@ void AArch64Subtarget::initializeProperties() {
 
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
-                                   const TargetMachine &TM, bool LittleEndian)
+                                   const TargetMachine &TM, bool LittleEndian,
+                                   unsigned MinSVEVectorSizeInBitsOverride,
+                                   unsigned MaxSVEVectorSizeInBitsOverride)
     : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
       ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
       CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
       IsLittle(LittleEndian),
-      TargetTriple(TT), FrameLowering(),
-      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
-      TLInfo(TM, *this) {
+      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
+      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
+      FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)),
+      TSInfo(), TLInfo(TM, *this) {
   if (AArch64::isX18ReservedByDefault(TT))
     ReserveXRegister.set(18);
 
@@ -356,28 +347,6 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
   MFI.computeMaxCallFrameSize(MF);
 }
 
-unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
-  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
-  assert(SVEVectorBitsMax % 128 == 0 &&
-         "SVE requires vector length in multiples of 128!");
-  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
-         "Minimum SVE vector size should not be larger than its maximum!");
-  if (SVEVectorBitsMax == 0)
-    return 0;
-  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
-}
-
-unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
-  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
-  assert(SVEVectorBitsMin % 128 == 0 &&
-         "SVE requires vector length in multiples of 128!");
-  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
-         "Minimum SVE vector size should not be larger than its maximum!");
-  if (SVEVectorBitsMax == 0)
-    return (SVEVectorBitsMin / 128) * 128;
-  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
-}
-
 bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
   // Prefer NEON unless larger SVE registers are available.
   return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 8345e9c5861..ea1fbc18d3b 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -261,6 +261,9 @@ protected:
 
   bool IsLittle;
 
+  unsigned MinSVEVectorSizeInBits;
+  unsigned MaxSVEVectorSizeInBits;
+
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
 
@@ -291,7 +294,9 @@ public:
   /// of the specified triple.
   AArch64Subtarget(const Triple &TT, const std::string &CPU,
                    const std::string &FS, const TargetMachine &TM,
-                   bool LittleEndian);
+                   bool LittleEndian,
+                   unsigned MinSVEVectorSizeInBitsOverride = 0,
+                   unsigned MaxSVEVectorSizeInBitsOverride = 0);
 
   const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
@@ -585,8 +590,16 @@ public:
   // Return the known range for the bit length of SVE data registers. A value
   // of 0 means nothing is known about that particular limit beyond what's
   // implied by the architecture.
-  unsigned getMaxSVEVectorSizeInBits() const;
-  unsigned getMinSVEVectorSizeInBits() const;
+  unsigned getMaxSVEVectorSizeInBits() const {
+    assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+    return MaxSVEVectorSizeInBits;
+  }
+
+  unsigned getMinSVEVectorSizeInBits() const {
+    assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+    return MinSVEVectorSizeInBits;
+  }
+
   bool useSVEForFixedLengthVectors() const;
 };
 } // End llvm namespace
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index ed02a9eb083..d6eba1e5f1d 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -161,6 +161,18 @@ static cl::opt<bool>
     cl::desc("Enable the AArch64 branch target pass"), cl::init(true));
 
+static cl::opt<unsigned> SVEVectorBitsMaxOpt(
+    "aarch64-sve-vector-bits-max",
+    cl::desc("Assume SVE vector registers are at most this big, "
+             "with zero meaning no maximum size is assumed."),
+    cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> SVEVectorBitsMinOpt(
+    "aarch64-sve-vector-bits-min",
+    cl::desc("Assume SVE vector registers are at least this big, "
+             "with zero meaning no minimum size is assumed."),
+    cl::init(0), cl::Hidden);
+
 extern cl::opt<bool> EnableHomogeneousPrologEpilog;
 
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
@@ -349,14 +361,54 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
   std::string FS = FSAttr.isValid() ?
       FSAttr.getValueAsString().str() : TargetFS;
 
-  auto &I = SubtargetMap[CPU + FS];
+  SmallString<512> Key;
+
+  unsigned MinSVEVectorSize = 0;
+  unsigned MaxSVEVectorSize = 0;
+  Attribute VScaleRangeAttr = F.getFnAttribute(Attribute::VScaleRange);
+  if (VScaleRangeAttr.isValid()) {
+    std::tie(MinSVEVectorSize, MaxSVEVectorSize) =
+        VScaleRangeAttr.getVScaleRangeArgs();
+    MinSVEVectorSize *= 128;
+    MaxSVEVectorSize *= 128;
+  } else {
+    MinSVEVectorSize = SVEVectorBitsMinOpt;
+    MaxSVEVectorSize = SVEVectorBitsMaxOpt;
+  }
+
+  assert(MinSVEVectorSize % 128 == 0 &&
+         "SVE requires vector length in multiples of 128!");
+  assert(MaxSVEVectorSize % 128 == 0 &&
+         "SVE requires vector length in multiples of 128!");
+  assert((MaxSVEVectorSize >= MinSVEVectorSize || MaxSVEVectorSize == 0) &&
+         "Minimum SVE vector size should not be larger than its maximum!");
+
+  // Sanitize user input in case of no asserts
+  if (MaxSVEVectorSize == 0)
+    MinSVEVectorSize = (MinSVEVectorSize / 128) * 128;
+  else {
+    MinSVEVectorSize =
+        (std::min(MinSVEVectorSize, MaxSVEVectorSize) / 128) * 128;
+    MaxSVEVectorSize =
+        (std::max(MinSVEVectorSize, MaxSVEVectorSize) / 128) * 128;
+  }
+
+  Key += "SVEMin";
+  Key += std::to_string(MinSVEVectorSize);
+  Key += "SVEMax";
+  Key += std::to_string(MaxSVEVectorSize);
+  Key += CPU;
+  Key += FS;
+
+  auto &I = SubtargetMap[Key];
   if (!I) {
     // This needs to be done before we create a new subtarget since any
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
     I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
-                                           isLittle);
+                                           isLittle, MinSVEVectorSize,
+                                           MaxSVEVectorSize);
   }
   return I.get();
 }
diff --git a/test/CodeGen/AArch64/sve-vscale-attr.ll b/test/CodeGen/AArch64/sve-vscale-attr.ll
new file mode 100644
index 00000000000..7ffee3a0c23
--- /dev/null
+++ b/test/CodeGen/AArch64/sve-vscale-attr.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOARG
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ARG
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @func_vscale_none(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; CHECK-NOARG-LABEL: func_vscale_none:
+; CHECK-NOARG:       // %bb.0:
+; CHECK-NOARG-NEXT:    ldp q0, q1, [x0]
+; CHECK-NOARG-NEXT:    ldp q2, q3, [x1]
+; CHECK-NOARG-NEXT:    ldp q4, q5, [x0, #32]
+; CHECK-NOARG-NEXT:    ldp q7, q6, [x1, #32]
+; CHECK-NOARG-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NOARG-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NOARG-NEXT:    add v2.4s, v5.4s, v6.4s
+; CHECK-NOARG-NEXT:    add v3.4s, v4.4s, v7.4s
+; CHECK-NOARG-NEXT:    stp q3, q2, [x0, #32]
+; CHECK-NOARG-NEXT:    stp q0, q1, [x0]
+; CHECK-NOARG-NEXT:    ret
+;
+; CHECK-ARG-LABEL: func_vscale_none:
+; CHECK-ARG:       // %bb.0:
+; CHECK-ARG-NEXT:    ptrue p0.s, vl16
+; CHECK-ARG-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-ARG-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-ARG-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; CHECK-ARG-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-ARG-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+define void @func_vscale1_1(<16 x i32>* %a, <16 x i32>* %b) #1 {
+; CHECK-LABEL: func_vscale1_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    ldp q4, q5, [x0, #32]
+; CHECK-NEXT:    ldp q7, q6, [x1, #32]
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v2.4s, v5.4s, v6.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v7.4s
+; CHECK-NEXT:    stp q3, q2, [x0, #32]
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #1 = { "target-features"="+sve" vscale_range(1,1) }
+
+define void @func_vscale2_2(<16 x i32>* %a, <16 x i32>* %b) #2 {
+; CHECK-LABEL: func_vscale2_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    add x8, x0, #32 // =32
+; CHECK-NEXT:    add x9, x1, #32 // =32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x9]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    add z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #2 = { "target-features"="+sve" vscale_range(2,2) }
+
+define void @func_vscale2_4(<16 x i32>* %a, <16 x i32>* %b) #3 {
+; CHECK-LABEL: func_vscale2_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    add x8, x0, #32 // =32
+; CHECK-NEXT:    add x9, x1, #32 // =32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x9]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    add z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
+
+define void @func_vscale4_4(<16 x i32>* %a, <16 x i32>* %b) #4 {
+; CHECK-LABEL: func_vscale4_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #4 = { "target-features"="+sve" vscale_range(4,4) }
+
+define void @func_vscale8_8(<16 x i32>* %a, <16 x i32>* %b) #5 {
+; CHECK-LABEL: func_vscale8_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #5 = { "target-features"="+sve" vscale_range(8,8) }
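
Usage sketch (not part of the patch): the vscale_range arguments are multiplied by 128, the SVE register granule, before seeding the subtarget, so the hypothetical @example below, carrying vscale_range(4,4), behaves like compiling with -aarch64-sve-vector-bits-min=512 -aarch64-sve-vector-bits-max=512. Because the known minimum width (4 * 128 = 512 bits) is at least 256, useSVEForFixedLengthVectors() returns true and the 512-bit add lowers to a single predicated SVE add rather than four NEON adds, as in func_vscale4_4 above.

; Hypothetical example distilled from the test above, with the attributes
; written inline on the define rather than via an attribute group.
define void @example(<16 x i32>* %a, <16 x i32>* %b) "target-features"="+sve" vscale_range(4,4) {
  %op1 = load <16 x i32>, <16 x i32>* %a   ; 512-bit fixed-length vector
  %op2 = load <16 x i32>, <16 x i32>* %b
  %res = add <16 x i32> %op1, %op2         ; one SVE add z0.s, p0/m, z0.s, z1.s
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}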