[AMDGPU] Skip additional folding on the same operand.

Reviewers: rampitec, arsenm

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69355
2024-11-23 11:13:28 +01:00 · 2019-10-23 15:19:06 -04:00 · 2019-10-23 15:19:06 -04:00 · b532a94abc
commit b532a94abc
parent 87e90e0fd2
3 changed files with 61 additions and 7 deletions
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@ -312,6 +312,19 @@ static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
  return false;
 }

+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo,
+                                MachineOperand *FoldOp, bool Commuted = false,
+                                int ShrinkOp = -1) {
+  // Skip if FoldList already has a candidate for operand OpNo of MI.
+  for (FoldCandidate &Fold : FoldList)
+    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+      return;
+  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
+                    << " operand " << OpNo << "\n  " << *MI << '\n');
+  FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
@ -344,7 +357,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
-      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+      appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
      return true;
    }

@ -403,8 +416,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
        unsigned MaybeCommutedOpc = MI->getOpcode();
        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);

-        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
-                                         Op32));
+        appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
        return true;
      }

@ -412,11 +424,11 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
      return false;
    }

-    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
+    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
    return true;
  }

-  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
 }

@ -494,7 +506,7 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
    return false;

-  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Op);
  return true;
 }

@ -1398,5 +1410,5 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
      foldInstOperand(MI, OpToFold);
    }
  }
-  return false;
+  return true;
 }
--- a/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/test/CodeGen/AMDGPU/fold-imm-copy.mir
@ -22,3 +22,21 @@ body:             |
    %9:vgpr_32 = COPY %8
    %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec
 ...
+
+---
+# GCN-LABEL:       name: no_extra_fold_on_same_opnd
+# The first XOR needs commuting to fold that immediate operand.
+# GCN:             V_XOR_B32_e32 {{.*}} 0, %1
+# GCN:             V_XOR_B32_e32 %2, %4.sub0
+name: no_extra_fold_on_same_opnd
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:vgpr_32  = IMPLICIT_DEF
+    %1:vgpr_32  = IMPLICIT_DEF
+    %2:vgpr_32  = IMPLICIT_DEF
+    %3:vgpr_32  = V_MOV_B32_e32 0, implicit $exec
+    %4:vreg_64  = REG_SEQUENCE killed %0, %subreg.sub0, killed %3, %subreg.sub1
+    %5:vgpr_32  = V_XOR_B32_e32 %1, %4.sub1, implicit $exec
+    %6:vgpr_32  = V_XOR_B32_e32 %2, %4.sub0, implicit $exec
+...
--- a/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/test/CodeGen/AMDGPU/operand-folding.ll
@ -124,6 +124,30 @@ define amdgpu_kernel void @no_fold_tied_subregister() {
  ret void
 }

+; There should be exactly one fold on the same operand.
+; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @no_extra_fold_on_same_opnd() {
+entry:
+  %s0 = load i32, i32 addrspace(5)* undef, align 4
+  %s0.i64= zext i32 %s0 to i64
+  br label %for.body.i.i
+
+for.body.i.i:
+  %s1 = load i32, i32 addrspace(1)* undef, align 8
+  %s1.i64 = sext i32 %s1 to i64
+  %xor = xor i64 %s1.i64, %s0.i64
+  %flag = icmp ult i64 %xor, 8
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  unreachable
+
+if.else:
+  unreachable
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0

 attributes #0 = { nounwind readnone }