
[llvm-mca] Add support for instructions with a variadic number of operands.

By default, llvm-mca conservatively assumes that a register operand from the
variadic sequence is both a register read and a register write.  That is because
MCInstrDesc doesn't describe extra variadic operands; we don't have enough
dataflow information to tell which register operands from the variadic sequence
are definitions and which are uses.

However, if a variadic instruction is flagged 'mayStore' (but not 'mayLoad'),
and it has no 'unmodeledSideEffects', then llvm-mca (very) optimistically
assumes that any register operand in the variadic sequence is a register read
only. Conversely, if a variadic instruction is marked as 'mayLoad' (but not
'mayStore'), and it has no 'unmodeledSideEffects', then llvm-mca optimistically
assumes that any extra register operand is a register definition only.
These assumptions work quite well for variadic load/store multiple instructions
defined by the ARM backend.

llvm-svn: 347522
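
For reference, the two assumptions above reduce to simple checks on existing
MCInstrDesc flags. A minimal sketch (the helper names here are ours; the patch
below implements the same predicates inline as the AssumeUsesOnly and
AssumeDefsOnly locals):

#include "llvm/MC/MCInstrDesc.h"

// True if every extra register operand in the variadic sequence can be
// treated as a register read only (e.g. ARM's STM, which stores the
// registers in the list without redefining them).
static bool assumeUsesOnly(const llvm::MCInstrDesc &MCDesc) {
  return MCDesc.mayStore() && !MCDesc.mayLoad() &&
         !MCDesc.hasUnmodeledSideEffects();
}

// True if every extra register operand in the variadic sequence can be
// treated as a register definition only (e.g. ARM's LDM, which loads into
// the registers in the list).
static bool assumeDefsOnly(const llvm::MCInstrDesc &MCDesc) {
  return MCDesc.mayLoad() && !MCDesc.mayStore() &&
         !MCDesc.hasUnmodeledSideEffects();
}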
Andrea Di Biagio 2018-11-25 12:46:24 +00:00
parent 04925aa9f6
commit 5864f5b887
2 changed files with 146 additions and 18 deletions


@@ -0,0 +1,65 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=armv7-unknown-unknown -mcpu=swift -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
ldm r2!, {r3, r4, r5, r6, r12, lr}
stm r0!, {r3, r4, r5, r6, r12, lr}
# CHECK: Iterations: 300
# CHECK-NEXT: Instructions: 600
# CHECK-NEXT: Total Cycles: 1295
# CHECK-NEXT: Total uOps: 2400
# CHECK: Dispatch Width: 3
# CHECK-NEXT: uOps Per Cycle: 1.85
# CHECK-NEXT: IPC: 0.46
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 3 18 2.00 * ldm r2!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: 5 1 2.00 * stm r0!, {r3, r4, r5, r6, r12, lr}
# CHECK: Resources:
# CHECK-NEXT: [0] - SwiftUnitDiv
# CHECK-NEXT: [1] - SwiftUnitP0
# CHECK-NEXT: [2] - SwiftUnitP1
# CHECK-NEXT: [3] - SwiftUnitP2
# CHECK-NEXT: [4.0] - SwiftUnitP01
# CHECK-NEXT: [4.1] - SwiftUnitP01
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4.0] [4.1]
# CHECK-NEXT: - - - 4.00 2.46 2.54
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4.0] [4.1] Instructions:
# CHECK-NEXT: - - - 2.00 1.09 0.91 ldm r2!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: - - - 2.00 1.37 1.63 stm r0!, {r3, r4, r5, r6, r12, lr}
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 012345678
# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeER . . ldm r2!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: [0,1] .D=================eER . . stm r0!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: [1,0] . DeeeeeeeeeeeeeeeeeeER . . ldm r2!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: [1,1] . D=================eER. . stm r0!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: [2,0] . .DeeeeeeeeeeeeeeeeeeER . ldm r2!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: [2,1] . . D==================eER stm r0!, {r3, r4, r5, r6, r12, lr}
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 0.0 ldm r2!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: 1. 3 18.3 0.3 0.0 stm r0!, {r3, r4, r5, r6, r12, lr}


@@ -190,14 +190,6 @@ static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
 }
 
 static Error verifyOperands(const MCInstrDesc &MCDesc, const MCInst &MCI) {
-  // Variadic opcodes are not correctly supported.
-  if (MCDesc.isVariadic()) {
-    if (MCI.getNumOperands() - MCDesc.getNumOperands()) {
-      return make_error<InstructionError<MCInst>>(
-          "Don't know how to process this variadic opcode.", MCI);
-    }
-  }
-
   // Count register definitions, and skip non register operands in the process.
   unsigned I, E;
   unsigned NumExplicitDefs = MCDesc.getNumDefs();
@@ -281,7 +273,8 @@ void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
   if (MCDesc.hasOptionalDef())
     TotalDefs++;
-  ID.Writes.resize(TotalDefs);
+  unsigned NumVariadicOps = MCI.getNumOperands() - MCDesc.getNumOperands();
+  ID.Writes.resize(TotalDefs + NumVariadicOps);
 
   // Iterate over the operands list, and skip non-register operands.
   // The first NumExplictDefs register operands are expected to be register
   // definitions.
@@ -358,6 +351,41 @@ void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
              << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
     });
   }
+
+  if (!NumVariadicOps)
+    return;
+
+  // FIXME: if an instruction opcode is flagged 'mayStore', and it has no
+  // 'unmodeledSideEffects', then this logic optimistically assumes that any
+  // extra register operand in the variadic sequence is not a register
+  // definition.
+  //
+  // Otherwise, we conservatively assume that any register operand from the
+  // variadic sequence is both a register read and a register write.
+  bool AssumeUsesOnly = MCDesc.mayStore() && !MCDesc.mayLoad() &&
+                        !MCDesc.hasUnmodeledSideEffects();
+  CurrentDef = NumExplicitDefs + NumImplicitDefs + MCDesc.hasOptionalDef();
+  for (unsigned I = 0, OpIndex = MCDesc.getNumOperands();
+       I < NumVariadicOps && !AssumeUsesOnly; ++I, ++OpIndex) {
+    const MCOperand &Op = MCI.getOperand(OpIndex);
+    if (!Op.isReg())
+      continue;
+
+    WriteDescriptor &Write = ID.Writes[CurrentDef];
+    Write.OpIndex = OpIndex;
+    // Assign a default latency for this write.
+    Write.Latency = ID.MaxLatency;
+    Write.SClassOrWriteResourceID = 0;
+    Write.IsOptionalDef = false;
+    ++CurrentDef;
+    LLVM_DEBUG({
+      dbgs() << "\t\t[Def][V] OpIdx=" << Write.OpIndex
+             << ", Latency=" << Write.Latency
+             << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
+    });
+  }
+
+  ID.Writes.resize(CurrentDef);
 }
 
 void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
@@ -368,14 +396,21 @@ void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
   // Remove the optional definition.
   if (MCDesc.hasOptionalDef())
     --NumExplicitUses;
-  unsigned TotalUses = NumExplicitUses + NumImplicitUses;
+  unsigned NumVariadicOps = MCI.getNumOperands() - MCDesc.getNumOperands();
+  unsigned TotalUses = NumExplicitUses + NumImplicitUses + NumVariadicOps;
   ID.Reads.resize(TotalUses);
-  for (unsigned I = 0; I < NumExplicitUses; ++I) {
-    ReadDescriptor &Read = ID.Reads[I];
-    Read.OpIndex = MCDesc.getNumDefs() + I;
+  unsigned CurrentUse = 0;
+  for (unsigned I = 0, OpIndex = MCDesc.getNumDefs(); I < NumExplicitUses;
+       ++I, ++OpIndex) {
+    const MCOperand &Op = MCI.getOperand(OpIndex);
+    if (!Op.isReg())
+      continue;
+    ReadDescriptor &Read = ID.Reads[CurrentUse];
+    Read.OpIndex = OpIndex;
     Read.UseIndex = I;
     Read.SchedClassID = SchedClassID;
+    ++CurrentUse;
     LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex
                       << ", UseIndex=" << Read.UseIndex << '\n');
   }
@@ -383,7 +418,7 @@ void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
   // For the purpose of ReadAdvance, implicit uses come directly after explicit
   // uses. The "UseIndex" must be updated according to that implicit layout.
   for (unsigned I = 0; I < NumImplicitUses; ++I) {
-    ReadDescriptor &Read = ID.Reads[NumExplicitUses + I];
+    ReadDescriptor &Read = ID.Reads[CurrentUse + I];
     Read.OpIndex = ~I;
     Read.UseIndex = NumExplicitUses + I;
     Read.RegisterID = MCDesc.getImplicitUses()[I];
@@ -392,6 +427,32 @@ void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
                       << ", UseIndex=" << Read.UseIndex << ", RegisterID="
                       << MRI.getName(Read.RegisterID) << '\n');
   }
+
+  CurrentUse += NumImplicitUses;
+
+  // FIXME: If an instruction opcode is marked as 'mayLoad', and it has no
+  // 'unmodeledSideEffects', then this logic optimistically assumes that any
+  // extra register operand in the variadic sequence is a register definition
+  // only, and not also a register read.
+  bool AssumeDefsOnly = !MCDesc.mayStore() && MCDesc.mayLoad() &&
+                        !MCDesc.hasUnmodeledSideEffects();
+  for (unsigned I = 0, OpIndex = MCDesc.getNumOperands();
+       I < NumVariadicOps && !AssumeDefsOnly; ++I, ++OpIndex) {
+    const MCOperand &Op = MCI.getOperand(OpIndex);
+    if (!Op.isReg())
+      continue;
+
+    ReadDescriptor &Read = ID.Reads[CurrentUse];
+    Read.OpIndex = OpIndex;
+    Read.UseIndex = NumExplicitUses + NumImplicitUses + I;
+    Read.SchedClassID = SchedClassID;
+    ++CurrentUse;
+    LLVM_DEBUG(dbgs() << "\t\t[Use][V] OpIdx=" << Read.OpIndex
+                      << ", UseIndex=" << Read.UseIndex << '\n');
+  }
+
+  ID.Reads.resize(CurrentUse);
 }
 
 Error InstrBuilder::verifyInstrDesc(const InstrDesc &ID,
@@ -431,10 +492,11 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
   // Then obtain the scheduling class information from the instruction.
   unsigned SchedClassID = MCDesc.getSchedClass();
-  unsigned CPUID = SM.getProcessorID();
+  bool IsVariant = SM.getSchedClassDesc(SchedClassID)->isVariant();
 
   // Try to solve variant scheduling classes.
-  if (SchedClassID) {
+  if (IsVariant) {
+    unsigned CPUID = SM.getProcessorID();
     while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
       SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
@@ -493,7 +555,8 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
   // Now add the new descriptor.
-  SchedClassID = MCDesc.getSchedClass();
-  if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) {
+  bool IsVariadic = MCDesc.isVariadic();
+  if (!IsVariadic && !IsVariant) {
     Descriptors[MCI.getOpcode()] = std::move(ID);
     return *Descriptors[MCI.getOpcode()];
   }
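
One note on the final hunk: a descriptor may only be cached and reused per
opcode when the opcode alone fixes the operand layout, which no longer holds
for variadic instructions. A self-contained sketch of the hazard, using
hypothetical stand-in types rather than llvm-mca's real classes:

#include <cassert>
#include <map>

// Hypothetical stand-ins for llvm-mca's InstrDesc and its Descriptors cache.
struct MiniDesc {
  unsigned NumWrites; // for an LDM, one write per register in the list
};

static MiniDesc buildDesc(unsigned RegListSize) {
  // For a variadic load-multiple, the write count depends on the specific
  // MCInst (its register list), not just on the opcode.
  return MiniDesc{RegListSize};
}

int main() {
  std::map<unsigned, MiniDesc> Cache; // keyed by opcode only
  const unsigned FakeLDMOpcode = 42;

  Cache[FakeLDMOpcode] = buildDesc(2);    // ldm r0!, {r1, r2}
  MiniDesc Cached = Cache[FakeLDMOpcode]; // later reused for...
  MiniDesc Actual = buildDesc(4);         // ...ldm r0!, {r1, r2, r3, r4}

  // The cached descriptor under-counts the second instance's writes; this is
  // why createInstrDescImpl now skips the cache when IsVariadic is true.
  assert(Cached.NumWrites != Actual.NumWrites);
  return 0;
}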