1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00

[llvm-mca][X86] Teach how to identify register writes that implicitly clear the upper portion of a super-register.

This patch teaches llvm-mca how to identify register writes that implicitly zero
the upper portion of a super-register.

On X86-64, a general purpose register is implemented in hardware as a 64-bit
register. Quoting the Intel 64 Software Developer's Manual: "an update to the
lower 32 bits of a 64 bit integer register is architecturally defined to zero
extend the upper 32 bits".  Also, a write to an XMM register performed by an AVX
instruction implicitly zeroes the upper 128 bits of the aliasing YMM register.

This patch adds a new method named clearsSuperRegisters to the MCInstrAnalysis
interface to help identify instructions that implicitly clear the upper portion
of a super-register.  The rest of the patch teaches llvm-mca how to use that new
method to obtain the information, and update the register dependencies
accordingly.

I compared the kernels from tests clear-super-register-1.s and
clear-super-register-2.s against the output from perf on btver2.  Previously
there was a large discrepancy between the estimated IPC and the measured IPC.
Now the differences are mostly in the noise.

Differential Revision: https://reviews.llvm.org/D48225

llvm-svn: 335113
This commit is contained in:
Andrea Di Biagio 2018-06-20 10:08:11 +00:00
parent 5a74ae2b63
commit 4893e095df
15 changed files with 325 additions and 200 deletions

View File

@ -22,6 +22,8 @@
namespace llvm {
class MCRegisterInfo;
class MCInstrAnalysis {
protected:
friend class Target;
@ -60,6 +62,31 @@ public:
return Info->get(Inst.getOpcode()).isTerminator();
}
/// Returns true if at least one of the register writes performed by
/// \p Inst implicitly clears the upper portion of all super-registers.
///
/// Example: on X86-64, a write to EAX implicitly clears the upper half of
/// RAX. Also (still on x86) an XMM write performed by an AVX 128-bit
/// instruction implicitly clears the upper portion of the corresponding
/// YMM register.
///
/// This method also updates an APInt which is used as mask of register
/// writes. There is one bit for every explicit/implicit write performed by
/// the instruction. If a write implicitly clears its super-registers, then
/// the corresponding bit is set (otherwise, the corresponding bit is cleared).
///
/// The first bits in the APInt are related to explicit writes. The remaining
/// bits are related to implicit writes. The sequence of writes follows the
/// machine operand sequence. For implicit writes, the sequence is defined by
/// the MCInstrDesc.
///
/// The assumption is that the bit-width of the APInt is correctly set by
/// the caller. The default implementation conservatively assumes that none of
/// the writes clears the upper portion of a super-register.
virtual bool clearsSuperRegisters(const MCRegisterInfo &MRI,
const MCInst &Inst,
APInt &Writes) const;
/// Given a branch instruction try to get the address the branch
/// targets. Return true on success, and the address in Target.
virtual bool

View File

@ -8,6 +8,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
@ -15,6 +17,13 @@
using namespace llvm;
/// Target-independent fallback for clearsSuperRegisters.
///
/// Conservatively reports that no register write of \p Inst clears the upper
/// portion of a super-register: every bit of \p Writes is reset and the
/// function returns false. \p MRI and \p Inst are not inspected here.
bool MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
                                           const MCInst &Inst,
                                           APInt &Writes) const {
  // With no target knowledge, no write is flagged in the mask.
  const bool AnyWriteClearsSuperRegs = false;
  Writes.clearAllBits();
  return AnyWriteClearsSuperRegs;
}
bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const {
if (Inst.getNumOperands() == 0 ||

View File

@ -14,7 +14,9 @@
#include "X86MCTargetDesc.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "InstPrinter/X86IntelInstPrinter.h"
#include "X86BaseInfo.h"
#include "X86MCAsmInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/MC/MCInstrAnalysis.h"
@ -293,8 +295,79 @@ static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
return llvm::createMCRelocationInfo(TheTriple, Ctx);
}
namespace llvm {
namespace X86_MC {
/// X86 flavour of MCInstrAnalysis. It overrides clearsSuperRegisters so that
/// register writes which implicitly zero the upper portion of a
/// super-register can be identified.
class X86MCInstrAnalysis : public MCInstrAnalysis {
public:
  /// Forwards the instruction info table to the MCInstrAnalysis base.
  X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}

  /// See MCInstrAnalysis::clearsSuperRegisters.
  bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
                            APInt &Mask) const override;

private:
  // Instances are not copyable.
  X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete;
  X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete;

  // The destructor is deliberately kept private, exactly as before.
  virtual ~X86MCInstrAnalysis() = default;
};
bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
const MCInst &Inst,
APInt &Mask) const {
const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
unsigned NumDefs = Desc.getNumDefs();
unsigned NumImplicitDefs = Desc.getNumImplicitDefs();
assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
"Unexpected number of bits in the mask!");
bool HasVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::VEX;
bool HasEVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX;
bool HasXOP = (Desc.TSFlags & X86II::EncodingMask) == X86II::XOP;
const MCRegisterClass &GR32RC = MRI.getRegClass(X86::GR32RegClassID);
const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID);
const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID);
auto ClearsSuperReg = [=](unsigned RegID) {
// On X86-64, a general purpose integer register is viewed as a 64-bit
// register internal to the processor.
// An update to the lower 32 bits of a 64 bit integer register is
// architecturally defined to zero extend the upper 32 bits.
if (GR32RC.contains(RegID))
return true;
// Early exit if this instruction has no vex/evex/xop prefix.
if (!HasEVEX && !HasVEX && !HasXOP)
return false;
// All VEX and EVEX encoded instructions are defined to zero the high bits
// of the destination register up to VLMAX (i.e. the maximum vector register
// width pertaining to the instruction).
// We assume the same behavior for XOP instructions too.
return VR128XRC.contains(RegID) || VR256XRC.contains(RegID);
};
Mask.clearAllBits();
for (unsigned I = 0, E = NumDefs; I < E; ++I) {
const MCOperand &Op = Inst.getOperand(I);
if (ClearsSuperReg(Op.getReg()))
Mask.setBit(I);
}
for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
const MCPhysReg Reg = Desc.getImplicitDefs()[I];
if (ClearsSuperReg(Reg))
Mask.setBit(NumDefs + I);
}
return Mask.getBoolValue();
}
} // end of namespace X86_MC
} // end of namespace llvm
/// Factory for the X86 instruction analysis object.
///
/// Returns a newly allocated X86_MC::X86MCInstrAnalysis built from \p Info, so
/// that the X86 override of clearsSuperRegisters is the one in effect. The
/// object is returned as a raw pointer; presumably the caller takes
/// ownership (confirm against the registration site).
static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
  return new X86_MC::X86MCInstrAnalysis(Info);
}
// Force static initialization.

View File

@ -15,9 +15,9 @@ bsf %rax, %rcx
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 6.0
# CHECK: Instruction Info:
@ -35,17 +35,17 @@ bsf %rax, %rcx
# CHECK-NEXT: 8 5 2.00 bsfq %rax, %rcx
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0123456
# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . . .. imulq $5, %rcx, %rax
# CHECK-NEXT: [0,1] .DeE----R . . . .. lzcntl %ecx, %eax
# CHECK-NEXT: [0,2] .D=====eER. . . .. andq %rcx, %rax
# CHECK-NEXT: [0,3] . D=====eeeeeER. . .. bsfq %rax, %rcx
# CHECK-NEXT: [1,0] . .D======eeeeeeER .. imulq $5, %rcx, %rax
# CHECK-NEXT: [1,1] . . D=====eE-----R .. lzcntl %ecx, %eax
# CHECK-NEXT: [1,2] . . D===========eER .. andq %rcx, %rax
# CHECK-NEXT: [1,3] . . D===========eeeeeER bsfq %rax, %rcx
# CHECK: [0,0] DeeeeeeER . . . imulq $5, %rcx, %rax
# CHECK-NEXT: [0,1] .DeE----R . . . lzcntl %ecx, %eax
# CHECK-NEXT: [0,2] .D=eE----R. . . andq %rcx, %rax
# CHECK-NEXT: [0,3] . D=eeeeeER . . bsfq %rax, %rcx
# CHECK-NEXT: [1,0] . .D==eeeeeeER. imulq $5, %rcx, %rax
# CHECK-NEXT: [1,1] . . D=eE-----R. lzcntl %ecx, %eax
# CHECK-NEXT: [1,2] . . D==eE-----R andq %rcx, %rax
# CHECK-NEXT: [1,3] . . D==eeeeeER bsfq %rax, %rcx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -54,7 +54,7 @@ bsf %rax, %rcx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.0 0.5 0.0 imulq $5, %rcx, %rax
# CHECK-NEXT: 1. 2 3.5 0.5 4.5 lzcntl %ecx, %eax
# CHECK-NEXT: 2. 2 9.0 0.0 0.0 andq %rcx, %rax
# CHECK-NEXT: 3. 2 9.0 0.0 0.0 bsfq %rax, %rcx
# CHECK-NEXT: 0. 2 2.0 0.5 0.0 imulq $5, %rcx, %rax
# CHECK-NEXT: 1. 2 1.5 0.5 4.5 lzcntl %ecx, %eax
# CHECK-NEXT: 2. 2 2.5 0.0 4.5 andq %rcx, %rax
# CHECK-NEXT: 3. 2 2.5 0.0 0.0 bsfq %rax, %rcx

View File

@ -33,9 +33,9 @@ vandps %xmm4, %xmm1, %xmm0
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1800
# CHECK-NEXT: Total Cycles: 7003
# CHECK-NEXT: Total Cycles: 3811
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 0.26
# CHECK-NEXT: IPC: 0.47
# CHECK-NEXT: Block RThroughput: 38.0
# CHECK: Instruction Info:
@ -67,27 +67,31 @@ vandps %xmm4, %xmm1, %xmm0
# CHECK-NEXT: 1 1 0.50 vandps %xmm4, %xmm1, %xmm0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789 0123456789 01234
# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789
# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . . vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: [0,1] .DeeeE----------------------------------R . . . . . . . vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: [0,2] . D====================================eeeER . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,3] . D=====================================eeeER . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,4] . D======================================eeeER . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,5] . D=======================================eeeER. . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,6] . .D========================================eeeER . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,7] . . D=========================================eeeER . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,8] . . D==========================================eeeER . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,9] . . D===========================================eeeER . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,10] . . D============================================eeeER. . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,11] . . .D=============================================eeeER . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,12] . . . D==============================================eeeER . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,13] . . . D===============================================eeeER . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,14] . . . D================================================eeeER . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,15] . . . D=================================================eeeER. . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,16] . . . .D==================================================eeeER . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,17] . . . . D====================================================eER . vandps %xmm4, %xmm1, %xmm0
# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: [0,1] .DeeeE----------------------------------R . . . . . . . . vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: [0,2] . D==eeeE--------------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,3] . D===eeeE------------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,4] . D====eeeE-----------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,5] . D=====eeeE---------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,6] . .D======eeeE--------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,7] . . D=======eeeE------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,8] . . D========eeeE-----------------------R. . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,9] . . D=========eeeE---------------------R. . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,10] . . D==========eeeE--------------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,11] . . .D===========eeeE------------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,12] . . . D============eeeE-----------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,13] . . . D=============eeeE---------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,14] . . . D==============eeeE--------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,15] . . . D===============eeeE------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,16] . . . .D================eeeE-----------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,17] . . . . D==================eE----------R . . . . . . . vandps %xmm4, %xmm1, %xmm0
# CHECK-NEXT: [1,0] . . . . D====================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER. vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: [1,1] . . . . D=================eeeE-------------------------------------R. vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: [1,2] . . . . D===================eeeE-----------------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,3] . . . . .D====================eeeE---------------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -96,21 +100,21 @@ vandps %xmm4, %xmm1, %xmm0
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: 1. 1 1.0 1.0 34.0 vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: 2. 1 37.0 0.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 3. 1 38.0 2.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 4. 1 39.0 4.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 5. 1 40.0 6.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 6. 1 41.0 8.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 7. 1 42.0 10.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 8. 1 43.0 12.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 9. 1 44.0 14.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 10. 1 45.0 16.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 11. 1 46.0 18.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 12. 1 47.0 20.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 13. 1 48.0 22.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 14. 1 49.0 24.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 15. 1 50.0 26.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 16. 1 51.0 28.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 17. 1 53.0 0.0 0.0 vandps %xmm4, %xmm1, %xmm0
# CHECK-NEXT: 0. 2 11.0 1.5 0.0 vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: 1. 2 9.5 0.5 35.5 vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: 2. 2 11.5 0.0 33.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 3. 2 12.5 2.0 31.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 4. 1 5.0 4.0 29.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 5. 1 6.0 6.0 27.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 6. 1 7.0 7.0 26.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 7. 1 8.0 8.0 24.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 8. 1 9.0 9.0 23.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 9. 1 10.0 10.0 21.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 10. 1 11.0 11.0 20.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 11. 1 12.0 12.0 18.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 12. 1 13.0 13.0 17.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 13. 1 14.0 14.0 15.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 14. 1 15.0 15.0 14.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 15. 1 16.0 16.0 12.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 16. 1 17.0 17.0 11.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 17. 1 19.0 0.0 10.0 vandps %xmm4, %xmm1, %xmm0

View File

@ -10,9 +10,9 @@
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
# CHECK-NEXT: Total Cycles: 2103
# CHECK-NEXT: Total Cycles: 318
# CHECK-NEXT: Dispatch Width: 4
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: IPC: 1.89
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Instruction Info:
@ -55,21 +55,21 @@
# CHECK-NEXT: - - - 1.00 - - - - vaddps %xmm4, %xmm5, %xmm0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 01234
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0123456
# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vaddps %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %xmm4, %xmm5, %xmm0
# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vaddps %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %xmm4, %xmm5, %xmm0
# CHECK: [0,0] DeeeeeER . . . .. vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [0,1] DeeeE--R . . . .. vaddps %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [0,2] D===eeeeeER . . .. vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %xmm4, %xmm5, %xmm0
# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [1,1] .DeeeE--------------------R vaddps %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %xmm4, %xmm5, %xmm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -78,9 +78,9 @@
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vaddps %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %xmm4, %xmm5, %xmm0
# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vaddps %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm5, %xmm0

View File

@ -10,9 +10,9 @@
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
# CHECK-NEXT: Total Cycles: 2103
# CHECK-NEXT: Total Cycles: 318
# CHECK-NEXT: Dispatch Width: 4
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: IPC: 1.89
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Instruction Info:
@ -55,21 +55,21 @@
# CHECK-NEXT: - - - 1.00 - - - - vaddps %xmm4, %xmm5, %xmm0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 01234
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0123456
# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vaddps %ymm1, %ymm1, %ymm2
# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %zmm2, %zmm3, %zmm4
# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %xmm4, %xmm5, %xmm0
# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vaddps %ymm1, %ymm1, %ymm2
# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %zmm2, %zmm3, %zmm4
# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %xmm4, %xmm5, %xmm0
# CHECK: [0,0] DeeeeeER . . . .. vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [0,1] DeeeE--R . . . .. vaddps %ymm1, %ymm1, %ymm2
# CHECK-NEXT: [0,2] D===eeeeeER . . .. vmulps %zmm2, %zmm3, %zmm4
# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %xmm4, %xmm5, %xmm0
# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [1,1] .DeeeE--------------------R vaddps %ymm1, %ymm1, %ymm2
# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %zmm2, %zmm3, %zmm4
# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %xmm4, %xmm5, %xmm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -78,9 +78,9 @@
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vaddps %ymm1, %ymm1, %ymm2
# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %zmm2, %zmm3, %zmm4
# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %xmm4, %xmm5, %xmm0
# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vaddps %ymm1, %ymm1, %ymm2
# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %zmm2, %zmm3, %zmm4
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm5, %xmm0

View File

@ -10,9 +10,9 @@
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
# CHECK-NEXT: Total Cycles: 2103
# CHECK-NEXT: Total Cycles: 318
# CHECK-NEXT: Dispatch Width: 4
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: IPC: 1.89
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Instruction Info:
@ -55,21 +55,21 @@
# CHECK-NEXT: - - - 1.00 - - - - vaddps %xmm4, %xmm20, %xmm0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 01234
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0123456
# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vaddps %xmm16, %xmm17, %xmm2
# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %xmm4, %xmm18, %xmm6
# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %xmm6, %xmm19, %xmm4
# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %xmm4, %xmm20, %xmm0
# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vaddps %xmm16, %xmm17, %xmm2
# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %xmm4, %xmm18, %xmm6
# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %xmm6, %xmm19, %xmm4
# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %xmm4, %xmm20, %xmm0
# CHECK: [0,0] DeeeeeER . . . .. vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [0,1] DeeeE--R . . . .. vaddps %xmm16, %xmm17, %xmm2
# CHECK-NEXT: [0,2] D===eeeeeER . . .. vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %xmm4, %xmm18, %xmm6
# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %xmm6, %xmm19, %xmm4
# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %xmm4, %xmm20, %xmm0
# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: [1,1] .DeeeE--------------------R vaddps %xmm16, %xmm17, %xmm2
# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %xmm4, %xmm18, %xmm6
# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %xmm6, %xmm19, %xmm4
# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %xmm4, %xmm20, %xmm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -78,9 +78,9 @@
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vaddps %xmm16, %xmm17, %xmm2
# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %xmm4, %xmm18, %xmm6
# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %xmm6, %xmm19, %xmm4
# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %xmm4, %xmm20, %xmm0
# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2
# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vaddps %xmm16, %xmm17, %xmm2
# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm18, %xmm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm19, %xmm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm20, %xmm0

View File

@ -10,9 +10,9 @@
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
# CHECK-NEXT: Total Cycles: 2103
# CHECK-NEXT: Total Cycles: 318
# CHECK-NEXT: Dispatch Width: 4
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: IPC: 1.89
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Instruction Info:
@ -55,21 +55,21 @@
# CHECK-NEXT: - - - 1.00 - - - - vaddps %ymm4, %ymm5, %ymm0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 01234
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0123456
# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vfrczpd %xmm1, %xmm2
# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %ymm4, %ymm5, %ymm0
# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vfrczpd %xmm1, %xmm2
# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %ymm4, %ymm5, %ymm0
# CHECK: [0,0] DeeeeeER . . . .. vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [0,1] DeeeE--R . . . .. vfrczpd %xmm1, %xmm2
# CHECK-NEXT: [0,2] D===eeeeeER . . .. vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %ymm4, %ymm5, %ymm0
# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [1,1] .DeeeE--------------------R vfrczpd %xmm1, %xmm2
# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %ymm4, %ymm5, %ymm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -78,9 +78,9 @@
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vfrczpd %xmm1, %xmm2
# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %ymm4, %ymm5, %ymm0
# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vfrczpd %xmm1, %xmm2
# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %ymm4, %ymm5, %ymm0

View File

@ -10,9 +10,9 @@
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
# CHECK-NEXT: Total Cycles: 2103
# CHECK-NEXT: Total Cycles: 316
# CHECK-NEXT: Dispatch Width: 4
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: IPC: 1.90
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Instruction Info:
@ -55,21 +55,21 @@
# CHECK-NEXT: - - - 1.00 - - - - vaddps %ymm4, %ymm5, %ymm0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 01234
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 01234
# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [0,1] DeE----R . . . . . . . . vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %ymm4, %ymm5, %ymm0
# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [1,1] .DeE------------------------R . . . . vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %ymm4, %ymm5, %ymm0
# CHECK: [0,0] DeeeeeER . . . . vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [0,1] DeE----R . . . . vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
# CHECK-NEXT: [0,2] D=eeeeeER . . . . vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [0,3] D======eeeER . . . vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [0,4] .D========eeeeeER . . vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [0,5] .D=============eeeER. . vaddps %ymm4, %ymm5, %ymm0
# CHECK-NEXT: [1,0] .D================eeeeeER vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: [1,1] .DeE--------------------R vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
# CHECK-NEXT: [1,2] . DeeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: [1,3] . D=====eeeE------------R vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: [1,4] . D========eeeeeE-------R vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: [1,5] . D=============eeeE----R vaddps %ymm4, %ymm5, %ymm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -78,9 +78,9 @@
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 2 1.0 1.0 14.0 vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %ymm4, %ymm5, %ymm0
# CHECK-NEXT: 0. 2 9.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
# CHECK-NEXT: 2. 2 1.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4
# CHECK-NEXT: 3. 2 6.5 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 9.0 0.0 3.5 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 14.0 0.0 2.0 vaddps %ymm4, %ymm5, %ymm0

View File

@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "InstrBuilder.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
@ -158,23 +159,6 @@ static void populateWrites(InstrDesc &ID, const MCInst &MCI,
const MCInstrDesc &MCDesc,
const MCSchedClassDesc &SCDesc,
const MCSubtargetInfo &STI) {
// Set if writes through this opcode may update super registers.
// TODO: on x86-64, a 4 byte write of a general purpose register always
// fully updates the super-register.
// More generally (at least on x86), not all register writes perform
// a partial (super-)register update.
// For example, an AVX instruction that writes to an XMM register implicitly
// zeroes the upper half of every aliasing super-register.
//
// For now, we pessimistically assume that writes are all potentially
// partial register updates. This is a good default for most targets, except
// for those like x86 which implement a special semantic for certain opcodes.
// At least on x86, this may lead to an inaccurate prediction of the
// instruction level parallelism.
bool FullyUpdatesSuperRegisters = false;
// Now Populate Writes.
// This algorithm currently works under the strong (and potentially incorrect)
// assumption that information related to register def/uses can be obtained
// from MCInstrDesc.
@ -275,7 +259,6 @@ static void populateWrites(InstrDesc &ID, const MCInst &MCI,
Write.Latency = ID.MaxLatency;
Write.SClassOrWriteResourceID = 0;
}
Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters;
Write.IsOptionalDef = false;
LLVM_DEBUG({
dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", Latency=" << Write.Latency
@ -488,16 +471,35 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
NewIS->getUses().emplace_back(llvm::make_unique<ReadState>(RD, RegID));
}
// Early exit if there are no writes.
if (D.Writes.empty())
return NewIS;
// Track register writes that implicitly clear the upper portion of the
// underlying super-registers using an APInt.
APInt WriteMask(D.Writes.size(), 0);
// Now query the MCInstrAnalysis object to obtain information about which
// register writes implicitly clear the upper portion of a super-register.
MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
// Initialize writes.
unsigned WriteIndex = 0;
for (const WriteDescriptor &WD : D.Writes) {
unsigned RegID =
WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg();
// Check if this is an optional definition that references NoReg.
if (WD.IsOptionalDef && !RegID)
if (WD.IsOptionalDef && !RegID) {
++WriteIndex;
continue;
}
assert(RegID && "Expected a valid register ID!");
NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(WD, RegID));
APInt CurrWriteMask = WriteMask & (1 << WriteIndex);
bool UpdatesSuperRegisters = CurrWriteMask.getBoolValue();
NewIS->getDefs().emplace_back(
llvm::make_unique<WriteState>(WD, RegID, UpdatesSuperRegisters));
++WriteIndex;
}
return NewIS;

View File

@ -17,7 +17,9 @@
#include "Instruction.h"
#include "Support.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
namespace mca {
@ -37,6 +39,8 @@ class DispatchUnit;
class InstrBuilder {
const llvm::MCSubtargetInfo &STI;
const llvm::MCInstrInfo &MCII;
const llvm::MCRegisterInfo &MRI;
const llvm::MCInstrAnalysis &MCIA;
llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
@ -48,8 +52,10 @@ class InstrBuilder {
InstrBuilder &operator=(const InstrBuilder &) = delete;
public:
InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii)
: STI(sti), MCII(mcii),
InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
const llvm::MCRegisterInfo &mri,
const llvm::MCInstrAnalysis &mcia)
: STI(sti), MCII(mcii), MRI(mri), MCIA(mcia),
ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) {
computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
}

View File

@ -70,11 +70,6 @@ struct WriteDescriptor {
// This field is set to a value different than zero only if this
// is an implicit definition.
unsigned RegisterID;
// True if this write generates a partial update of a super-register.
// On X86, this flag is set by byte/word writes on GPR registers. Also,
// a write of an XMM register only partially updates the corresponding
// YMM super-register if the write is associated with a legacy SSE instruction.
bool FullyUpdatesSuperRegs;
// Instruction itineraries would set this field to the SchedClass ID.
// Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
// element associated to this write.
@ -129,6 +124,10 @@ class WriteState {
// field RegisterID from WD.
unsigned RegisterID;
// True if this write implicitly clears the upper portion of RegisterID's
// super-registers.
bool ClearsSuperRegs;
// A list of dependent reads. Users is a set of dependent
// reads. A dependent read is added to the set only if CyclesLeft
// is "unknown". As soon as CyclesLeft is 'known', each user in the set
@ -138,8 +137,10 @@ class WriteState {
std::set<std::pair<ReadState *, int>> Users;
public:
WriteState(const WriteDescriptor &Desc, unsigned RegID)
: WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID) {}
WriteState(const WriteDescriptor &Desc, unsigned RegID,
bool clearsSuperRegs = false)
: WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
ClearsSuperRegs(clearsSuperRegs) {}
WriteState(const WriteState &Other) = delete;
WriteState &operator=(const WriteState &Other) = delete;
@ -148,7 +149,7 @@ public:
unsigned getRegisterID() const { return RegisterID; }
void addUser(ReadState *Use, int ReadAdvance);
bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; }
bool clearsSuperRegisters() const { return ClearsSuperRegs; }
// On every cycle, update CyclesLeft and notify dependent users.
void cycleEvent();

View File

@ -138,7 +138,7 @@ void RegisterFile::addRegisterWrite(WriteState &WS,
allocatePhysRegs(Mapping.second, UsedPhysRegs);
// If this is a partial update, then we are done.
if (!WS.fullyUpdatesSuperRegs())
if (!WS.clearsSuperRegisters())
return;
for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
@ -149,7 +149,7 @@ void RegisterFile::removeRegisterWrite(const WriteState &WS,
MutableArrayRef<unsigned> FreedPhysRegs,
bool ShouldFreePhysRegs) {
unsigned RegID = WS.getRegisterID();
bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs();
bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters();
assert(RegID != 0 && "Invalidating an already invalid register?");
assert(WS.getCyclesLeft() != -512 &&

View File

@ -388,6 +388,9 @@ int main(int argc, char **argv) {
std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
std::unique_ptr<MCInstrAnalysis> MCIA(
TheTarget->createMCInstrAnalysis(MCII.get()));
if (!MCPU.compare("native"))
MCPU = llvm::sys::getHostCPUName();
@ -457,7 +460,7 @@ int main(int argc, char **argv) {
Width = DispatchWidth;
// Create an instruction builder.
mca::InstrBuilder IB(*STI, *MCII);
mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA);
// Number each region in the sequence.
unsigned RegionIdx = 0;