Re-commit "[GlobalISel] Add legalization support for non-power-2 loads and stores"

This is an old commit that exposed a bug in the GISel importer, which caused non-truncating stores to be selected for truncating store patterns. Now that the importer bug has been fixed in r367737, this change can go back in. llvm-svn: 367739
2024-11-24 03:33:20 +01:00 · 2019-08-02 23:44:24 +00:00 · 2019-08-02 23:44:24 +00:00 · d894b5f8e3
commit d894b5f8e3
parent 3d80f1333d
5 changed files with 154 additions and 32 deletions
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@ -640,6 +640,10 @@ public:
    return actionIf(LegalizeAction::Unsupported,
                    LegalityPredicates::memSizeInBytesNotPow2(0));
  }
  /// Add a rule that applies LegalizeAction::Lower whenever the
  /// memSizeInBytesNotPow2 predicate holds for index 0 (presumably the first
  /// memory operand -- confirm against LegalityPredicates). Returns the rule
  /// set produced by actionIf so further rules can be chained.
  LegalizeRuleSet &lowerIfMemSizeNotPow2() {
    return actionIf(LegalizeAction::Lower,
                    LegalityPredicates::memSizeInBytesNotPow2(0));
  }
  LegalizeRuleSet &customIf(LegalityPredicate Predicate) {
    // We have no choice but conservatively assume that a custom action with a
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@ -1761,11 +1761,57 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
    LLT DstTy = MRI.getType(DstReg);
    auto &MMO = **MI.memoperands_begin();
-    if (DstTy.getSizeInBits() == MMO.getSize() /* in bytes */ * 8) {
+    if (DstTy.getSizeInBits() == MMO.getSizeInBits()) {
-      // In the case of G_LOAD, this was a non-extending load already and we're
+      if (MI.getOpcode() == TargetOpcode::G_LOAD) {
-      // about to lower to the same instruction.
+        // This load needs splitting into power of 2 sized loads.
-      if (MI.getOpcode() == TargetOpcode::G_LOAD)
+        if (DstTy.isVector())
          return UnableToLegalize;
        if (isPowerOf2_32(DstTy.getSizeInBits()))
          return UnableToLegalize; // Don't know what we're being asked to do.
        // Our strategy here is to generate anyextending loads for the smaller
        // types up to next power-2 result type, and then combine the two larger
        // result values together, before truncating back down to the non-pow-2
        // type.
        // E.g. v1 = i24 load =>
        // v2 = i32 load (2 byte)
        // v3 = i32 load (1 byte)
        // v4 = i32 shl v3, 16
        // v5 = i32 or v4, v2
        // v1 = i24 trunc v5
        // By doing this we generate the correct truncate which should get
        // combined away as an artifact with a matching extend.
        uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits());
        uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize;
        MachineFunction &MF = MIRBuilder.getMF();
        MachineMemOperand *LargeMMO =
            MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
        MachineMemOperand *SmallMMO = MF.getMachineMemOperand(
            &MMO, LargeSplitSize / 8, SmallSplitSize / 8);
        LLT PtrTy = MRI.getType(PtrReg);
        unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits());
        LLT AnyExtTy = LLT::scalar(AnyExtSize);
        Register LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
        Register SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
        auto LargeLoad =
            MIRBuilder.buildLoad(LargeLdReg, PtrReg, *LargeMMO);
        auto OffsetCst =
            MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
        Register GEPReg = MRI.createGenericVirtualRegister(PtrTy);
        auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
        auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0),
                                              *SmallMMO);
        auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
        auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
        auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
        MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)});
        MI.eraseFromParent();
        return Legalized;
      }
      MIRBuilder.buildLoad(DstReg, PtrReg, MMO);
      MI.eraseFromParent();
      return Legalized;
@ -1794,6 +1840,51 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
    return UnableToLegalize;
  }
  case TargetOpcode::G_STORE: {
    // Lower a non-power of 2 store into multiple pow-2 stores.
    // E.g. split an i24 store into an i16 store + i8 store.
    // We do this by first extending the stored value to the next largest power
    // of 2 type, and then using truncating stores to store the components.
    // By doing this, as with G_LOAD, we generate an extend that can be
    // artifact-combined away instead of leaving behind extracts.
    Register SrcReg = MI.getOperand(0).getReg();
    Register PtrReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    MachineMemOperand &MMO = **MI.memoperands_begin();
    // Only stores whose value type is exactly as wide as the memory access are
    // handled here; anything else is rejected.
    if (SrcTy.getSizeInBits() != MMO.getSizeInBits())
      return UnableToLegalize;
    // Vector stores are not split by this lowering.
    if (SrcTy.isVector())
      return UnableToLegalize;
    if (isPowerOf2_32(SrcTy.getSizeInBits()))
      return UnableToLegalize; // Don't know what we're being asked to do.

    // Extend to the next pow-2.
    const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits()));
    auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg);

    // Obtain the smaller value by shifting away the larger value.
    uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits());
    uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize;
    auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize);
    auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt);

    // Generate the GEP and truncating stores.
    // The byte offset constant is made as wide as the pointer itself rather
    // than hard-coding 64 bits, so the lowering does not assume 64-bit
    // pointers on targets with narrower address spaces.
    LLT PtrTy = MRI.getType(PtrReg);
    auto OffsetCst = MIRBuilder.buildConstant(
        LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
    Register GEPReg = MRI.createGenericVirtualRegister(PtrTy);
    auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));

    // Derive one memory operand per piece from the original: the large piece
    // covers bytes [0, LargeSplitSize / 8) and the small piece the bytes that
    // follow it.
    // NOTE(review): the low bits go to the lower address, which looks like a
    // little-endian layout -- confirm big-endian targets do not reach here.
    MachineFunction &MF = MIRBuilder.getMF();
    MachineMemOperand *LargeMMO =
        MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
    MachineMemOperand *SmallMMO =
        MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
    MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO);
    MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@ -256,14 +256,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
      .legalForTypesWithMemDesc({{s32, p0, 8, 8},
                                 {s32, p0, 16, 8}})
      .clampScalar(0, s8, s64)
-      .widenScalarToNextPow2(0)
+      .lowerIfMemSizeNotPow2()
      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
      //       how to do that yet.
      .unsupportedIfMemSizeNotPow2()
      // Lower any any-extending loads left into G_ANYEXT and G_LOAD
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
      })
      .widenScalarToNextPow2(0)
      .clampMaxNumElements(0, s32, 2)
      .clampMaxNumElements(0, s64, 1)
      .customIf(IsPtrVecPred);
@ -271,6 +269,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
  getActionDefinitionsBuilder(G_STORE)
      .legalForTypesWithMemDesc({{s8, p0, 8, 8},
                                 {s16, p0, 16, 8},
                                 {s32, p0, 8, 8},
                                 {s32, p0, 16, 8},
                                 {s32, p0, 32, 8},
                                 {s64, p0, 64, 8},
                                 {p0, p0, 64, 8},
@ -282,10 +282,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
                                 {v4s32, p0, 128, 8},
                                 {v2s64, p0, 128, 8}})
      .clampScalar(0, s8, s64)
-      .widenScalarToNextPow2(0)
+      .lowerIfMemSizeNotPow2()
      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
      //       how to do that yet.
      .unsupportedIfMemSizeNotPow2()
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@ -54,26 +54,6 @@ false:
 }
 ; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s32) = G_LOAD %1:_(p0) :: (load 3 from `i24* undef`, align 1) (in function: odd_type_load)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type_load
 ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type_load
 define i32 @odd_type_load() {
 entry:
  %ld = load i24, i24* undef, align 1
  %cst = zext i24 %ld to i32
  ret i32 %cst
 }
  ; General legalizer inability to handle types whose size wasn't a power of 2.
 ; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %1:_(s42), %0:_(p0) :: (store 6 into %ir.addr, align 8) (in function: odd_type)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type
 ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type:
 define void @odd_type(i42* %addr) {
  %val42 = load i42, i42* %addr
  store i42 %val42, i42* %addr
  ret void
 }
 ; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %1:_(<7 x s32>), %0:_(p0) :: (store 28 into %ir.addr, align 32) (in function: odd_vector)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_vector
 ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_vector:
--- a/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir
@ -0,0 +1,50 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
 --- |
  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
  target triple = "aarch64"
  define i32 @load_store_test(i24* %ptr, i24* %ptr2) {
    %val = load i24, i24* %ptr
    store i24 %val, i24* %ptr2
    ret i32 0
  }
 ...
 ---
 name:            load_store_test
 alignment:       2
 tracksRegLiveness: true
 body:             |
  bb.1 (%ir-block.0):
    liveins: $x0, $x1
    ; CHECK-LABEL: name: load_store_test
    ; CHECK: liveins: $x0, $x1
    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2 from %ir.ptr, align 4)
    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64)
    ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 1 from %ir.ptr + 2, align 4)
    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C2]](s32)
    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LOAD]]
    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
    ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C3]](s64)
    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64)
    ; CHECK: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store 2 into %ir.ptr2, align 4)
    ; CHECK: G_STORE [[LSHR]](s32), [[GEP1]](p0) :: (store 1 into %ir.ptr2 + 2, align 4)
    ; CHECK: $w0 = COPY [[C]](s32)
    ; CHECK: RET_ReallyLR implicit $w0
    %0:_(p0) = COPY $x0
    %1:_(p0) = COPY $x1
    %3:_(s32) = G_CONSTANT i32 0
    %2:_(s24) = G_LOAD %0(p0) :: (load 3 from %ir.ptr, align 4)
    G_STORE %2(s24), %1(p0) :: (store 3 into %ir.ptr2, align 4)
    $w0 = COPY %3(s32)
    RET_ReallyLR implicit $w0
 ...