mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
72f01d1369
Summary: Reduce the statefulness of the algorithm in two ways: 1. More clearly split generateWaitcntInstBefore into two phases: the first one which determines the required wait, if any, without changing the ScoreBrackets, and the second one which actually inserts the wait and updates the brackets. 2. Communicate pre-existing s_waitcnt instructions using an argument to generateWaitcntInstBefore instead of through the ScoreBrackets. To simplify these changes, a Waitcnt structure is introduced which carries the counts of an s_waitcnt instruction in decoded form. There are some functional changes: 1. The FIXME for the VCCZ bug workaround was implemented: we only wait for SMEM instructions as required instead of waiting on all counters. 2. We now properly track pre-existing waitcnt's in all cases, which leads to less conservative waitcnts being emitted in some cases. s_load_dword ... s_waitcnt lgkmcnt(0) <-- pre-existing wait count ds_read_b32 v0, ... ds_read_b32 v1, ... s_waitcnt lgkmcnt(0) <-- this is too conservative use(v0) more code use(v1) This increases code size a bit, but the reduced latency should still be a win in basically all cases. The worst code size regressions in my shader-db are: WORST REGRESSIONS - Code Size Before After Delta Percentage 1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0] 2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0] 4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0] 2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0] 3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0] Reviewers: msearles, rampitec, scott.linder, kanarayan Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam Differential Revision: https://reviews.llvm.org/D54226 llvm-svn: 347848
38 lines
1.8 KiB
YAML
38 lines
1.8 KiB
YAML
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s
|
|
|
|
# GCN-LABEL: name: test{{$}}
|
|
# GCN: S_WAITCNT -16257
|
|
# GGN: DS_READ2_B32
|
|
# GGN: DS_READ2_B32
|
|
# GCN: S_WAITCNT 383{{$}}
|
|
# GCN-NEXT: $vgpr1 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec
|
|
# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
|
|
# GCN-NEXT: S_WAITCNT 127{{$}}
|
|
# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
|
|
--- |
|
|
define amdgpu_cs void @test() {
|
|
ret void
|
|
}
|
|
...
|
|
---
|
|
name: test
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr1, $vgpr0
|
|
|
|
renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 480, 0
|
|
renamable $vgpr13 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
|
|
S_WAITCNT -16257
|
|
renamable $vgpr0_vgpr1 = DS_READ2_B32 renamable $vgpr13, 0, 1, 0, implicit $m0, implicit $exec
|
|
renamable $vgpr2_vgpr3 = DS_READ2_B32 renamable $vgpr13, 2, 3, 0, implicit $m0, implicit $exec
|
|
renamable $vgpr1 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec
|
|
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
|
|
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
|
|
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
|
|
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
|
|
$vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
|
|
$vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
|
|
IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
|
|
S_ENDPGM
|
|
...
|