mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-18 10:32:48 +02:00
Revert "[MCA] [AMDGPU] Adding an implementation to AMDGPUCustomBehaviour for handling s_waitcnt instructions."
Build failures when building with shared libraries. Reverting until I can fix. Differential Revision: https://reviews.llvm.org/D104730
This commit is contained in:
parent
6e1952e313
commit
36584cd187
@ -137,7 +137,6 @@ def MIReadVGPR : SchedReadVariant<[
|
||||
// The latency values are 1 / (operations / cycle) / 4.
|
||||
multiclass SICommonWriteRes {
|
||||
|
||||
let RetireOOO = 1 in { // llvm-mca specific flag
|
||||
def : HWWriteRes<WriteBranch, [HWBranch], 8>;
|
||||
def : HWWriteRes<WriteExport, [HWExport], 4>;
|
||||
def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64
|
||||
@ -160,7 +159,6 @@ multiclass SICommonWriteRes {
|
||||
def : HWWriteRes<Write8PassMAI, [HWXDL], 8>;
|
||||
let ResourceCycles = [16] in
|
||||
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
|
||||
} // End RetireOOO = 1
|
||||
|
||||
def : ReadAdvance<MIVGPRRead, -2>;
|
||||
|
||||
@ -184,7 +182,6 @@ let SchedModel = SIFullSpeedModel in {
|
||||
|
||||
defm : SICommonWriteRes;
|
||||
|
||||
let RetireOOO = 1 in { // llvm-mca specific flag
|
||||
def : HWVALUWriteRes<Write64Bit, 2>;
|
||||
def : HWVALUWriteRes<WriteIntMul, 4>;
|
||||
def : HWVALUWriteRes<WriteFloatFMA, 1>;
|
||||
@ -192,7 +189,6 @@ def : HWVALUWriteRes<WriteDouble, 4>;
|
||||
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
|
||||
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
|
||||
def : HWVALUWriteRes<WriteTrans64, 4>;
|
||||
} // End RetireOOO = 1
|
||||
|
||||
def : InstRW<[WriteCopy], (instrs COPY)>;
|
||||
|
||||
@ -202,7 +198,6 @@ let SchedModel = SIQuarterSpeedModel in {
|
||||
|
||||
defm : SICommonWriteRes;
|
||||
|
||||
let RetireOOO = 1 in { // llvm-mca specific flag
|
||||
def : HWVALUWriteRes<Write64Bit, 2>;
|
||||
def : HWVALUWriteRes<WriteIntMul, 4>;
|
||||
def : HWVALUWriteRes<WriteFloatFMA, 16>;
|
||||
@ -210,7 +205,6 @@ def : HWVALUWriteRes<WriteDouble, 16>;
|
||||
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
|
||||
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
|
||||
def : HWVALUWriteRes<WriteTrans64, 16>;
|
||||
} // End RetireOOO = 1
|
||||
|
||||
def : InstRW<[WriteCopy], (instrs COPY)>;
|
||||
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
|
||||
@ -224,7 +218,6 @@ let SchedModel = SIDPFullSpeedModel in {
|
||||
|
||||
defm : SICommonWriteRes;
|
||||
|
||||
let RetireOOO = 1 in { // llvm-mca specific flag
|
||||
def : HWVALUWriteRes<WriteFloatFMA, 1>;
|
||||
def : HWVALUWriteRes<WriteDouble, 1>;
|
||||
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
|
||||
@ -232,7 +225,6 @@ def : HWVALUWriteRes<WriteDoubleCvt, 1>;
|
||||
def : HWVALUWriteRes<WriteTrans64, 4>;
|
||||
def : HWVALUWriteRes<WriteIntMul, 1>;
|
||||
def : HWVALUWriteRes<Write64Bit, 1>;
|
||||
} // End RetireOOO = 1
|
||||
|
||||
def : InstRW<[WriteCopy], (instrs COPY)>;
|
||||
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
|
||||
@ -248,7 +240,6 @@ let SchedModel = GFX10SpeedModel in {
|
||||
|
||||
// The latency values are 1 / (operations / cycle).
|
||||
// Add 1 stall cycle for VGPR read.
|
||||
let RetireOOO = 1 in { // llvm-mca specific flag
|
||||
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
|
||||
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
|
||||
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
|
||||
@ -268,7 +259,6 @@ def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
|
||||
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
|
||||
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
|
||||
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
|
||||
} // End RetireOOO = 1
|
||||
|
||||
def : InstRW<[WriteCopy], (instrs COPY)>;
|
||||
|
||||
|
@ -41,12 +41,12 @@ v_sqrt_f64 v[4:5], v[4:5]
|
||||
|
||||
# CHECK: Iterations: 1
|
||||
# CHECK-NEXT: Instructions: 28
|
||||
# CHECK-NEXT: Total Cycles: 205
|
||||
# CHECK-NEXT: Total Cycles: 224
|
||||
# CHECK-NEXT: Total uOps: 29
|
||||
|
||||
# CHECK: Dispatch Width: 1
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.14
|
||||
# CHECK-NEXT: IPC: 0.14
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.13
|
||||
# CHECK-NEXT: IPC: 0.13
|
||||
# CHECK-NEXT: Block RThroughput: 29.0
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
@ -133,37 +133,37 @@ v_sqrt_f64 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: - - - 1.00 - 1.00 1.00 - v_sqrt_f64_e32 v[4:5], v[4:5]
|
||||
|
||||
# CHECK: Timeline view:
|
||||
# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789
|
||||
# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 01234
|
||||
# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789
|
||||
# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123
|
||||
|
||||
# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1]
|
||||
# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2
|
||||
# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5]
|
||||
# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6
|
||||
# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9]
|
||||
# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10
|
||||
# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1]
|
||||
# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7]
|
||||
# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5]
|
||||
# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7]
|
||||
# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9]
|
||||
# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
|
||||
# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5]
|
||||
# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1]
|
||||
# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2
|
||||
# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5]
|
||||
# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6
|
||||
# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9]
|
||||
# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10
|
||||
# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1]
|
||||
# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7]
|
||||
# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5]
|
||||
# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7]
|
||||
# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9]
|
||||
# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
|
||||
# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1]
|
||||
# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5]
|
||||
|
||||
# CHECK: Average Wait times (based on the timeline view):
|
||||
# CHECK-NEXT: [0]: Executions
|
||||
|
@ -1,233 +0,0 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx900 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
|
||||
|
||||
s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
s_waitcnt lgkmcnt(0)
|
||||
v_mov_b32_e32 v0, s2
|
||||
v_mov_b32_e32 v1, s3
|
||||
flat_load_dword v2, v[0:1]
|
||||
flat_load_dword v3, v[0:1] offset:8
|
||||
flat_load_dword v4, v[0:1] offset:16
|
||||
flat_load_dword v5, v[0:1] offset:24
|
||||
v_mov_b32_e32 v0, s0
|
||||
v_mov_b32_e32 v1, s1
|
||||
v_mov_b32_e32 v6, s6
|
||||
v_mov_b32_e32 v7, s7
|
||||
v_mov_b32_e32 v8, s8
|
||||
v_mov_b32_e32 v9, s9
|
||||
v_mov_b32_e32 v10, s10
|
||||
v_mov_b32_e32 v11, s11
|
||||
v_mov_b32_e32 v12, s12
|
||||
v_mov_b32_e32 v13, s13
|
||||
v_mov_b32_e32 v14, s14
|
||||
v_mov_b32_e32 v15, s15
|
||||
v_mov_b32_e32 v16, s16
|
||||
v_mov_b32_e32 v17, s17
|
||||
v_mov_b32_e32 v18, s18
|
||||
v_mov_b32_e32 v19, s19
|
||||
v_mov_b32_e32 v20, s20
|
||||
v_mov_b32_e32 v21, s21
|
||||
v_mov_b32_e32 v22, s22
|
||||
v_mov_b32_e32 v23, s23
|
||||
v_mov_b32_e32 v24, s24
|
||||
v_mov_b32_e32 v25, s25
|
||||
v_mov_b32_e32 v26, s26
|
||||
v_mov_b32_e32 v27, s27
|
||||
v_mov_b32_e32 v28, s28
|
||||
v_mov_b32_e32 v29, s29
|
||||
s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
|
||||
# CHECK: Iterations: 1
|
||||
# CHECK-NEXT: Instructions: 36
|
||||
# CHECK-NEXT: Total Cycles: 94
|
||||
# CHECK-NEXT: Total uOps: 36
|
||||
|
||||
# CHECK: Dispatch Width: 1
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.38
|
||||
# CHECK-NEXT: IPC: 0.38
|
||||
# CHECK-NEXT: Block RThroughput: 36.0
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
# CHECK-NEXT: [1]: #uOps
|
||||
# CHECK-NEXT: [2]: Latency
|
||||
# CHECK-NEXT: [3]: RThroughput
|
||||
# CHECK-NEXT: [4]: MayLoad
|
||||
# CHECK-NEXT: [5]: MayStore
|
||||
# CHECK-NEXT: [6]: HasSideEffects (U)
|
||||
|
||||
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
|
||||
# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
# CHECK-NEXT: 1 1 1.00 U s_waitcnt lgkmcnt(0)
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s2
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s3
|
||||
# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v2, v[0:1]
|
||||
# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v3, v[0:1] offset:8
|
||||
# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v4, v[0:1] offset:16
|
||||
# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v5, v[0:1] offset:24
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s0
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s1
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v6, s6
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v7, s7
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v8, s8
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v9, s9
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v10, s10
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v11, s11
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v12, s12
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v13, s13
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v14, s14
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v15, s15
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v16, s16
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v17, s17
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v18, s18
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v19, s19
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v20, s20
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v21, s21
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v22, s22
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v23, s23
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v24, s24
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v25, s25
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v26, s26
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v27, s27
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v28, s28
|
||||
# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v29, s29
|
||||
# CHECK-NEXT: 1 1 1.00 U s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
|
||||
# CHECK: Resources:
|
||||
# CHECK-NEXT: [0] - HWBranch
|
||||
# CHECK-NEXT: [1] - HWExport
|
||||
# CHECK-NEXT: [2] - HWLGKM
|
||||
# CHECK-NEXT: [3] - HWSALU
|
||||
# CHECK-NEXT: [4] - HWVALU
|
||||
# CHECK-NEXT: [5] - HWVMEM
|
||||
# CHECK-NEXT: [6] - HWXDL
|
||||
|
||||
# CHECK: Resource pressure per iteration:
|
||||
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
|
||||
# CHECK-NEXT: - - 2.00 2.00 28.00 4.00 -
|
||||
|
||||
# CHECK: Resource pressure by instruction:
|
||||
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
|
||||
# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
# CHECK-NEXT: - - - 1.00 - - - s_waitcnt lgkmcnt(0)
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s2
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s3
|
||||
# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v2, v[0:1]
|
||||
# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v3, v[0:1] offset:8
|
||||
# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v4, v[0:1] offset:16
|
||||
# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v5, v[0:1] offset:24
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s0
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s1
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v6, s6
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v7, s7
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v8, s8
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v9, s9
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v10, s10
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v11, s11
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v12, s12
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v13, s13
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v14, s14
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v15, s15
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v16, s16
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v17, s17
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v18, s18
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v19, s19
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v20, s20
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v21, s21
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v22, s22
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v23, s23
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v24, s24
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v25, s25
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v26, s26
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v27, s27
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v28, s28
|
||||
# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v29, s29
|
||||
# CHECK-NEXT: - - - 1.00 - - - s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
|
||||
# CHECK: Timeline view:
|
||||
# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123
|
||||
# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789
|
||||
|
||||
# CHECK: [0,0] DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
# CHECK-NEXT: [0,1] .DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
# CHECK-NEXT: [0,2] . .DE . . . . . . . . . . . . . . . . . . s_waitcnt lgkmcnt(0)
|
||||
# CHECK-NEXT: [0,3] . . DE . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s2
|
||||
# CHECK-NEXT: [0,4] . . DE. . . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s3
|
||||
# CHECK-NEXT: [0,5] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . flat_load_dword v2, v[0:1]
|
||||
# CHECK-NEXT: [0,6] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v3, v[0:1] offset:8
|
||||
# CHECK-NEXT: [0,7] . . .DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v4, v[0:1] offset:16
|
||||
# CHECK-NEXT: [0,8] . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. flat_load_dword v5, v[0:1] offset:24
|
||||
# CHECK-NEXT: [0,9] . . . DE. . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s0
|
||||
# CHECK-NEXT: [0,10] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s1
|
||||
# CHECK-NEXT: [0,11] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v6, s6
|
||||
# CHECK-NEXT: [0,12] . . . .DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v7, s7
|
||||
# CHECK-NEXT: [0,13] . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v8, s8
|
||||
# CHECK-NEXT: [0,14] . . . . DE. . . . . . . . . . . . . . . . v_mov_b32_e32 v9, s9
|
||||
# CHECK-NEXT: [0,15] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v10, s10
|
||||
# CHECK-NEXT: [0,16] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v11, s11
|
||||
# CHECK-NEXT: [0,17] . . . . .DE . . . . . . . . . . . . . . . v_mov_b32_e32 v12, s12
|
||||
# CHECK-NEXT: [0,18] . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v13, s13
|
||||
# CHECK-NEXT: [0,19] . . . . . DE. . . . . . . . . . . . . . . v_mov_b32_e32 v14, s14
|
||||
# CHECK-NEXT: [0,20] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v15, s15
|
||||
# CHECK-NEXT: [0,21] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v16, s16
|
||||
# CHECK-NEXT: [0,22] . . . . . .DE . . . . . . . . . . . . . . v_mov_b32_e32 v17, s17
|
||||
# CHECK-NEXT: [0,23] . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v18, s18
|
||||
# CHECK-NEXT: [0,24] . . . . . . DE. . . . . . . . . . . . . . v_mov_b32_e32 v19, s19
|
||||
# CHECK-NEXT: [0,25] . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v20, s20
|
||||
# CHECK-NEXT: [0,26] . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v21, s21
|
||||
# CHECK-NEXT: [0,27] . . . . . . .DE . . . . . . . . . . . . . v_mov_b32_e32 v22, s22
|
||||
# CHECK-NEXT: [0,28] . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v23, s23
|
||||
# CHECK-NEXT: [0,29] . . . . . . . DE. . . . . . . . . . . . . v_mov_b32_e32 v24, s24
|
||||
# CHECK-NEXT: [0,30] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v25, s25
|
||||
# CHECK-NEXT: [0,31] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v26, s26
|
||||
# CHECK-NEXT: [0,32] . . . . . . . .DE . . . . . . . . . . . . v_mov_b32_e32 v27, s27
|
||||
# CHECK-NEXT: [0,33] . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v28, s28
|
||||
# CHECK-NEXT: [0,34] . . . . . . . . DE. . . . . . . . . . . . v_mov_b32_e32 v29, s29
|
||||
# CHECK-NEXT: [0,35] . . . . . . . . . . . . . . . . . . . DE s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
|
||||
# CHECK: Average Wait times (based on the timeline view):
|
||||
# CHECK-NEXT: [0]: Executions
|
||||
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
|
||||
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
|
||||
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
|
||||
|
||||
# CHECK: [0] [1] [2] [3]
|
||||
# CHECK-NEXT: 0. 1 0.0 0.0 0.0 s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
# CHECK-NEXT: 1. 1 0.0 0.0 0.0 s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
# CHECK-NEXT: 2. 1 0.0 0.0 0.0 s_waitcnt lgkmcnt(0)
|
||||
# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s2
|
||||
# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s3
|
||||
# CHECK-NEXT: 5. 1 0.0 0.0 0.0 flat_load_dword v2, v[0:1]
|
||||
# CHECK-NEXT: 6. 1 0.0 0.0 0.0 flat_load_dword v3, v[0:1] offset:8
|
||||
# CHECK-NEXT: 7. 1 0.0 0.0 0.0 flat_load_dword v4, v[0:1] offset:16
|
||||
# CHECK-NEXT: 8. 1 0.0 0.0 0.0 flat_load_dword v5, v[0:1] offset:24
|
||||
# CHECK-NEXT: 9. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s0
|
||||
# CHECK-NEXT: 10. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s1
|
||||
# CHECK-NEXT: 11. 1 0.0 0.0 0.0 v_mov_b32_e32 v6, s6
|
||||
# CHECK-NEXT: 12. 1 0.0 0.0 0.0 v_mov_b32_e32 v7, s7
|
||||
# CHECK-NEXT: 13. 1 0.0 0.0 0.0 v_mov_b32_e32 v8, s8
|
||||
# CHECK-NEXT: 14. 1 0.0 0.0 0.0 v_mov_b32_e32 v9, s9
|
||||
# CHECK-NEXT: 15. 1 0.0 0.0 0.0 v_mov_b32_e32 v10, s10
|
||||
# CHECK-NEXT: 16. 1 0.0 0.0 0.0 v_mov_b32_e32 v11, s11
|
||||
# CHECK-NEXT: 17. 1 0.0 0.0 0.0 v_mov_b32_e32 v12, s12
|
||||
# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_mov_b32_e32 v13, s13
|
||||
# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_mov_b32_e32 v14, s14
|
||||
# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_mov_b32_e32 v15, s15
|
||||
# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_mov_b32_e32 v16, s16
|
||||
# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_mov_b32_e32 v17, s17
|
||||
# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_mov_b32_e32 v18, s18
|
||||
# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_mov_b32_e32 v19, s19
|
||||
# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_mov_b32_e32 v20, s20
|
||||
# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_mov_b32_e32 v21, s21
|
||||
# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_mov_b32_e32 v22, s22
|
||||
# CHECK-NEXT: 28. 1 0.0 0.0 0.0 v_mov_b32_e32 v23, s23
|
||||
# CHECK-NEXT: 29. 1 0.0 0.0 0.0 v_mov_b32_e32 v24, s24
|
||||
# CHECK-NEXT: 30. 1 0.0 0.0 0.0 v_mov_b32_e32 v25, s25
|
||||
# CHECK-NEXT: 31. 1 0.0 0.0 0.0 v_mov_b32_e32 v26, s26
|
||||
# CHECK-NEXT: 32. 1 0.0 0.0 0.0 v_mov_b32_e32 v27, s27
|
||||
# CHECK-NEXT: 33. 1 0.0 0.0 0.0 v_mov_b32_e32 v28, s28
|
||||
# CHECK-NEXT: 34. 1 0.0 0.0 0.0 v_mov_b32_e32 v29, s29
|
||||
# CHECK-NEXT: 35. 1 0.0 0.0 0.0 s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>
|
@ -19,311 +19,15 @@
|
||||
namespace llvm {
|
||||
namespace mca {
|
||||
|
||||
// Post-processes Inst according to the opcode of MCI. Only the s_waitcnt
// opcode variants listed in the switch receive extra handling (through
// processWaitCnt). The switch has no default label, so for every other opcode
// this function does nothing to Inst.
void AMDGPUInstrPostProcess::postProcessInstruction(
|
||||
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
|
||||
switch (MCI.getOpcode()) {
|
||||
case AMDGPU::S_WAITCNT:
|
||||
case AMDGPU::S_WAITCNT_EXPCNT:
|
||||
case AMDGPU::S_WAITCNT_LGKMCNT:
|
||||
case AMDGPU::S_WAITCNT_VMCNT:
|
||||
case AMDGPU::S_WAITCNT_VSCNT:
|
||||
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_VSCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_gfx6_gfx7:
|
||||
case AMDGPU::S_WAITCNT_vi:
|
||||
// Every case label above intentionally shares this single handler.
return processWaitCnt(Inst, MCI);
|
||||
}
|
||||
}
|
||||
|
||||
// s_waitcnt instructions encode important information as immediate operands
|
||||
// which are lost during the MCInst -> mca::Instruction lowering.
|
||||
// To keep that information, every operand of MCI is copied onto Inst as an
// MCAOperand that carries the operand's position within MCI.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
|
||||
const MCInst &MCI) {
|
||||
for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
|
||||
// Op keeps its default-constructed state when the operand is neither a
// register nor an immediate; it is still given an index and appended below.
MCAOperand Op;
|
||||
const MCOperand &MCOp = MCI.getOperand(Idx);
|
||||
if (MCOp.isReg()) {
|
||||
Op = MCAOperand::createReg(MCOp.getReg());
|
||||
} else if (MCOp.isImm()) {
|
||||
Op = MCAOperand::createImm(MCOp.getImm());
|
||||
}
|
||||
// Record the operand's position in MCI before attaching it to Inst.
Op.setIndex(Idx);
|
||||
Inst->addOperand(Op);
|
||||
}
|
||||
}
|
||||
|
||||
// Constructs the AMDGPU custom behaviour: forwards STI, SrcMgr and MCII to the
// CustomBehaviour base class, then calls generateWaitCntInfo().
// NOTE(review): generateWaitCntInfo() is defined outside this view; presumably
// it fills the InstrWaitCntInfo table that handleWaitCnt() indexes - confirm.
AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
|
||||
const SourceMgr &SrcMgr,
|
||||
const MCInstrInfo &MCII)
|
||||
: CustomBehaviour(STI, SrcMgr, MCII) {
|
||||
generateWaitCntInfo();
|
||||
}
|
||||
: CustomBehaviour(STI, SrcMgr, MCII) {}
|
||||
|
||||
// Checks whether the instruction referenced by IR is subject to a custom
// hazard. Returns 0 for every opcode that is not one of the s_waitcnt variants
// listed below; for those variants, returns the result of
// handleWaitCnt(IssuedInst, IR).
unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
|
||||
const InstRef &IR) {
|
||||
const Instruction &Inst = *IR.getInstruction();
|
||||
unsigned Opcode = Inst.getOpcode();
|
||||
|
||||
// llvm-mca is generally run on fully compiled assembly so we wouldn't see any
|
||||
// pseudo instructions here. However, there are plans for the future to make
|
||||
// it possible to use mca within backend passes. As such, I have left the
|
||||
// pseudo version of s_waitcnt within this switch statement.
|
||||
switch (Opcode) {
|
||||
default:
|
||||
return 0;
|
||||
case AMDGPU::S_WAITCNT: // This instruction
|
||||
case AMDGPU::S_WAITCNT_EXPCNT:
|
||||
case AMDGPU::S_WAITCNT_LGKMCNT:
|
||||
case AMDGPU::S_WAITCNT_VMCNT:
|
||||
case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
|
||||
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_VSCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_gfx6_gfx7:
|
||||
case AMDGPU::S_WAITCNT_vi:
|
||||
// s_endpgm also behaves as if there is an implicit
|
||||
// s_waitcnt 0, but I'm not sure if it would be appropriate
|
||||
// to model this in llvm-mca based on how the iterations work
|
||||
// while simulating the pipeline over and over.
|
||||
return handleWaitCnt(IssuedInst, IR);
|
||||
}
|
||||
|
||||
// Not reached: both the default label and the case group above return.
return 0;
|
||||
}
|
||||
|
||||
// Decides whether the s_waitcnt instruction referenced by IR must keep
// stalling, given the instructions in IssuedInst.
//
// For each of the four counters (vm, exp, lgkm, vs) it counts how many
// instructions in IssuedInst are flagged for that counter and records the
// smallest getCyclesLeft() among them. If a count exceeds the limit obtained
// from computeWaitCnt(), that counter's smallest cycles-left value becomes a
// candidate; the smallest candidate is returned. Returns 0 when no counter
// exceeds its limit.
unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
|
||||
const InstRef &IR) {
|
||||
// Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
|
||||
// I do not know how that instruction works so I did not attempt to model it.
|
||||
// set the max values to begin
|
||||
// These four limits are passed by reference to computeWaitCnt() below, which
// may replace them with the values requested by this particular instruction.
unsigned Vmcnt = 63;
|
||||
unsigned Expcnt = 7;
|
||||
unsigned Lgkmcnt = 31;
|
||||
unsigned Vscnt = 63;
|
||||
// Number of instructions in IssuedInst flagged for each counter.
unsigned CurrVmcnt = 0;
|
||||
unsigned CurrExpcnt = 0;
|
||||
unsigned CurrLgkmcnt = 0;
|
||||
unsigned CurrVscnt = 0;
|
||||
// Smallest cycles-left value seen per counter; ~0U means "none seen yet".
unsigned CyclesToWaitVm = ~0U;
|
||||
unsigned CyclesToWaitExp = ~0U;
|
||||
unsigned CyclesToWaitLgkm = ~0U;
|
||||
unsigned CyclesToWaitVs = ~0U;
|
||||
|
||||
computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
|
||||
|
||||
// We will now look at each of the currently executing instructions
|
||||
// to find out if this wait instruction still needs to wait.
|
||||
for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
|
||||
const InstRef &PrevIR = *I;
|
||||
const Instruction &PrevInst = *PrevIR.getInstruction();
|
||||
// The source index is reduced modulo SrcMgr.size() before it is used to
// index InstrWaitCntInfo.
// NOTE(review): presumably because source indices keep growing across
// simulated iterations while the table has one entry per source
// instruction - confirm.
const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
|
||||
const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
|
||||
const int CyclesLeft = PrevInst.getCyclesLeft();
|
||||
assert(CyclesLeft != UNKNOWN_CYCLES &&
|
||||
"We should know how many cycles are left for this instruction");
|
||||
// An instruction may be flagged for several counters, so the four checks
// below are independent rather than an if/else chain. Each one bumps the
// counter's population and keeps the minimum cycles-left value.
if (PrevInstWaitInfo.VmCnt) {
|
||||
CurrVmcnt++;
|
||||
if ((unsigned)CyclesLeft < CyclesToWaitVm)
|
||||
CyclesToWaitVm = CyclesLeft;
|
||||
}
|
||||
if (PrevInstWaitInfo.ExpCnt) {
|
||||
CurrExpcnt++;
|
||||
if ((unsigned)CyclesLeft < CyclesToWaitExp)
|
||||
CyclesToWaitExp = CyclesLeft;
|
||||
}
|
||||
if (PrevInstWaitInfo.LgkmCnt) {
|
||||
CurrLgkmcnt++;
|
||||
if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
|
||||
CyclesToWaitLgkm = CyclesLeft;
|
||||
}
|
||||
if (PrevInstWaitInfo.VsCnt) {
|
||||
CurrVscnt++;
|
||||
if ((unsigned)CyclesLeft < CyclesToWaitVs)
|
||||
CyclesToWaitVs = CyclesLeft;
|
||||
}
|
||||
}
|
||||
|
||||
// Take the minimum cycles-left value over only those counters whose
// population exceeds the allowed limit; ~0U again means "no such counter".
unsigned CyclesToWait = ~0U;
|
||||
if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
|
||||
CyclesToWait = CyclesToWaitVm;
|
||||
if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
|
||||
CyclesToWait = CyclesToWaitExp;
|
||||
if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
|
||||
CyclesToWait = CyclesToWaitLgkm;
|
||||
if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
|
||||
CyclesToWait = CyclesToWaitVs;
|
||||
|
||||
// We may underestimate how many cycles we need to wait, but this
|
||||
// isn't a big deal. Our return value is just how many cycles until
|
||||
// this function gets run again. So as long as we don't overestimate
|
||||
// the wait time, we'll still end up stalling at this instruction
|
||||
// for the correct number of cycles.
|
||||
|
||||
// The sentinel survived, so no counter is over its limit: no stall needed.
if (CyclesToWait == ~0U)
|
||||
return 0;
|
||||
return CyclesToWait;
|
||||
}
|
||||
|
||||
void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
|
||||
unsigned &Expcnt, unsigned &Lgkmcnt,
|
||||
unsigned &Vscnt) {
|
||||
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
|
||||
const Instruction &Inst = *IR.getInstruction();
|
||||
unsigned Opcode = Inst.getOpcode();
|
||||
|
||||
switch (Opcode) {
|
||||
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
|
||||
// Should probably be checking for nullptr
|
||||
// here, but I'm not sure how I should handle the case
|
||||
// where we see a nullptr.
|
||||
const MCAOperand *OpReg = Inst.getOperand(0);
|
||||
const MCAOperand *OpImm = Inst.getOperand(1);
|
||||
assert(OpReg && OpReg->isReg() && "First operand should be a register.");
|
||||
assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
|
||||
if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
|
||||
// Instruction is using a real register.
|
||||
// Since we can't know what value this register will have,
|
||||
// we can't compute what the value of this wait should be.
|
||||
WithColor::warning() << "The register component of "
|
||||
<< MCII.getName(Opcode) << " will be completely "
|
||||
<< "ignored. So the wait may not be accurate.\n";
|
||||
}
|
||||
switch (Opcode) {
|
||||
// Redundant switch so I don't have to repeat the code above
|
||||
// for each case. There are more clever ways to avoid this
|
||||
// extra switch and anyone can feel free to implement one of them.
|
||||
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
|
||||
Expcnt = OpImm->getImm();
|
||||
break;
|
||||
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
|
||||
Lgkmcnt = OpImm->getImm();
|
||||
break;
|
||||
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
|
||||
Vmcnt = OpImm->getImm();
|
||||
break;
|
||||
case AMDGPU::S_WAITCNT_VSCNT_gfx10:
|
||||
Vscnt = OpImm->getImm();
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
case AMDGPU::S_WAITCNT_gfx10:
|
||||
case AMDGPU::S_WAITCNT_gfx6_gfx7:
|
||||
case AMDGPU::S_WAITCNT_vi:
|
||||
unsigned WaitCnt = Inst.getOperand(0)->getImm();
|
||||
AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void AMDGPUCustomBehaviour::generateWaitCntInfo() {
|
||||
// The core logic from this function is taken from
|
||||
// SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
|
||||
// that are being looked at are in the MachineInstr format, whereas we have
|
||||
// access to the MCInst format. The side effects of this are that we can't use
|
||||
// the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
|
||||
// functions. Therefore, we conservatively assume that these functions will
|
||||
// return true. This may cause a few instructions to be incorrectly tagged
|
||||
// with an extra CNT. However, these are instructions that do interact with at
|
||||
// least one CNT so giving them an extra CNT shouldn't cause issues in most
|
||||
// scenarios.
|
||||
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
|
||||
InstrWaitCntInfo.resize(SrcMgr.size());
|
||||
|
||||
int Index = 0;
|
||||
for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
|
||||
const std::unique_ptr<Instruction> &Inst = *I;
|
||||
unsigned Opcode = Inst->getOpcode();
|
||||
const MCInstrDesc &MCID = MCII.get(Opcode);
|
||||
if ((MCID.TSFlags & SIInstrFlags::DS) &&
|
||||
(MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
|
||||
InstrWaitCntInfo[Index].LgkmCnt = true;
|
||||
if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
|
||||
InstrWaitCntInfo[Index].ExpCnt = true;
|
||||
} else if (MCID.TSFlags & SIInstrFlags::FLAT) {
|
||||
// We conservatively assume that mayAccessVMEMThroughFlat(Inst)
|
||||
// and mayAccessLDSThroughFlat(Inst) would both return true for this
|
||||
// instruction. We have to do this because those functions use
|
||||
// information about the memory operands that we don't have access to.
|
||||
InstrWaitCntInfo[Index].LgkmCnt = true;
|
||||
if (!STI.hasFeature(AMDGPU::FeatureVscnt))
|
||||
InstrWaitCntInfo[Index].VmCnt = true;
|
||||
else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
|
||||
InstrWaitCntInfo[Index].VmCnt = true;
|
||||
else
|
||||
InstrWaitCntInfo[Index].VsCnt = true;
|
||||
} else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
|
||||
if (!STI.hasFeature(AMDGPU::FeatureVscnt))
|
||||
InstrWaitCntInfo[Index].VmCnt = true;
|
||||
else if ((MCID.mayLoad() &&
|
||||
!(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
|
||||
((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
|
||||
!MCID.mayStore()))
|
||||
InstrWaitCntInfo[Index].VmCnt = true;
|
||||
else if (MCID.mayStore())
|
||||
InstrWaitCntInfo[Index].VsCnt = true;
|
||||
|
||||
// (IV.Major < 7) is meant to represent
|
||||
// GCNTarget.vmemWriteNeedsExpWaitcnt()
|
||||
// which is defined as
|
||||
// { return getGeneration() < SEA_ISLANDS; }
|
||||
if (IV.Major < 7 &&
|
||||
(MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
|
||||
InstrWaitCntInfo[Index].ExpCnt = true;
|
||||
} else if (MCID.TSFlags & SIInstrFlags::SMRD) {
|
||||
InstrWaitCntInfo[Index].LgkmCnt = true;
|
||||
} else if (MCID.TSFlags & SIInstrFlags::EXP) {
|
||||
InstrWaitCntInfo[Index].ExpCnt = true;
|
||||
} else {
|
||||
switch (Opcode) {
|
||||
case AMDGPU::S_SENDMSG:
|
||||
case AMDGPU::S_SENDMSGHALT:
|
||||
case AMDGPU::S_MEMTIME:
|
||||
case AMDGPU::S_MEMREALTIME:
|
||||
InstrWaitCntInfo[Index].LgkmCnt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// taken from SIInstrInfo::isVMEM()
|
||||
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
|
||||
return MCID.TSFlags & SIInstrFlags::MUBUF ||
|
||||
MCID.TSFlags & SIInstrFlags::MTBUF ||
|
||||
MCID.TSFlags & SIInstrFlags::MIMG;
|
||||
}
|
||||
|
||||
// taken from SIInstrInfo::hasModifiersSet()
|
||||
bool AMDGPUCustomBehaviour::hasModifiersSet(
|
||||
const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
|
||||
int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
|
||||
if (Idx == -1)
|
||||
return false;
|
||||
|
||||
const MCAOperand *Op = Inst->getOperand(Idx);
|
||||
if (Op == nullptr || !Op->isImm() || !Op->getImm())
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// taken from SIInstrInfo::isAlwaysGDS()
|
||||
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
|
||||
return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
|
||||
Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
|
||||
Opcode == AMDGPU::DS_GWS_SEMA_P ||
|
||||
Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
|
||||
Opcode == AMDGPU::DS_GWS_BARRIER;
|
||||
}
|
||||
|
||||
} // namespace mca
|
||||
} // namespace llvm
|
||||
|
@ -23,8 +23,6 @@ namespace llvm {
|
||||
namespace mca {
|
||||
|
||||
class AMDGPUInstrPostProcess : public InstrPostProcess {
|
||||
void processWaitCnt(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
|
||||
|
||||
public:
|
||||
AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
|
||||
: InstrPostProcess(STI, MCII) {}
|
||||
@ -32,54 +30,10 @@ public:
|
||||
~AMDGPUInstrPostProcess() {}
|
||||
|
||||
void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
|
||||
const MCInst &MCI) override;
|
||||
};
|
||||
|
||||
struct WaitCntInfo {
|
||||
bool VmCnt = false;
|
||||
bool ExpCnt = false;
|
||||
bool LgkmCnt = false;
|
||||
bool VsCnt = false;
|
||||
const MCInst &MCI) override {}
|
||||
};
|
||||
|
||||
class AMDGPUCustomBehaviour : public CustomBehaviour {
|
||||
/// Whenever MCA would like to dispatch an s_waitcnt instruction,
|
||||
/// we must check all the instructions that are still executing to see if
|
||||
/// they modify the same CNT as we need to wait for. This vector
|
||||
/// gets built in the constructor and contains 1 WaitCntInfo struct
|
||||
/// for each instruction within the SrcManager. Each element
|
||||
/// tells us which CNTs that instruction may interact with.
|
||||
/// We conservatively assume some instructions interact with more
|
||||
/// CNTs than they do in reality, so we will occasionally wait
|
||||
/// longer than necessary, but we shouldn't ever wait for shorter.
|
||||
std::vector<WaitCntInfo> InstrWaitCntInfo;
|
||||
|
||||
/// This method gets called from the constructor and is
|
||||
/// where we setup the InstrWaitCntInfo vector.
|
||||
/// The core logic for determining which CNTs an instruction
|
||||
/// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter().
|
||||
/// Unfortunately, some of the logic from that function is not available to us
|
||||
/// in this scope so we conservatively end up assuming that some
|
||||
/// instructions interact with more CNTs than they do in reality.
|
||||
void generateWaitCntInfo();
|
||||
/// Helper function used in generateWaitCntInfo()
|
||||
bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst,
|
||||
unsigned OpName) const;
|
||||
/// Helper function used in generateWaitCntInfo()
|
||||
bool isAlwaysGDS(uint16_t Opcode) const;
|
||||
/// Helper function used in generateWaitCntInfo()
|
||||
bool isVMEM(const MCInstrDesc &MCID);
|
||||
/// This method gets called from checkCustomHazard when mca is attempting to
|
||||
/// dispatch an s_waitcnt instruction (or one of its variants). The method
|
||||
/// looks at each of the instructions that are still executing in the pipeline
|
||||
/// to determine if the waitcnt should force a wait.
|
||||
unsigned handleWaitCnt(ArrayRef<InstRef> IssuedInst, const InstRef &IR);
|
||||
/// Based on the type of s_waitcnt instruction we are looking at, and what its
|
||||
/// operands are, this method will set the values for each of the cnt
|
||||
/// references provided as arguments.
|
||||
void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt,
|
||||
unsigned &Lgkmcnt, unsigned &Vscnt);
|
||||
|
||||
public:
|
||||
AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
|
||||
const MCInstrInfo &MCII);
|
||||
|
Loading…
Reference in New Issue
Block a user