mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
2872193f2e
Summary:
This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer.

However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4").

Fixes at least:
- KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs
- KHR-GL45.cull_distance.functional
- piglit tes-input-gl_ClipDistance.shader_test
- ... and probably more

Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d
Reviewers: arsenm, mareko, rampitec
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D40303
llvm-svn: 318829
71 lines
2.4 KiB
YAML
71 lines
2.4 KiB
YAML
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
|
|
|
|
# Check that SILoadStoreOptimizer honors memory dependencies between moved
|
|
# instructions.
|
|
#
|
|
# The following IR snippet would usually be optimized by the peephole optimizer.
|
|
# However, an equivalent situation can occur with buffer instructions as well.
|
|
|
|
# CHECK-LABEL: name: mem_dependency
|
|
# CHECK: DS_READ2_B32 %0, 0, 1,
|
|
# CHECK: DS_WRITE_B32 %0, killed %1, 64,
|
|
# CHECK: DS_READ2_B32 %0, 16, 17,
|
|
# CHECK: DS_WRITE_B32 killed %0, %5, 0
|
|
|
|
--- |
|
|
; IR-level counterpart of the machine function `mem_dependency` in this file.
; The MIR memory operands (%ir.ptr.0, %ir.ptr.4, %ir.ptr.64) refer to the
; pointer values defined here.
;
; Takes one pointer, %ptr.0, to i32 in addrspace(3). The body loads from
; %ptr.0, stores that value to %ptr.64, reloads %ptr.64, loads %ptr.4, and
; stores the sum of the last two loads back to %ptr.0.
define amdgpu_kernel void @mem_dependency(i32 addrspace(3)* %ptr.0) nounwind {
|
|
; Element 1 of the i32 array at %ptr.0 (byte offset 4).
%ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
|
|
; Element 16 of the i32 array at %ptr.0 (byte offset 64).
%ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
|
|
%1 = load i32, i32 addrspace(3)* %ptr.0
|
|
store i32 %1, i32 addrspace(3)* %ptr.64
|
|
; Reads back the location written by the store just above, so this load
; depends on that store.
%2 = load i32, i32 addrspace(3)* %ptr.64
|
|
%3 = load i32, i32 addrspace(3)* %ptr.4
|
|
%4 = add i32 %2, %3
|
|
; Overwrites the location that the first load read from.
store i32 %4, i32 addrspace(3)* %ptr.0
|
|
ret void
|
|
}
|
|
...
|
|
---
|
|
# Machine function under test; the name matches the IR function
# @mem_dependency defined earlier in this file.
name: mem_dependency
|
|
alignment: 0
|
|
exposesReturnsTwice: false
|
|
# Selection-state flags; all are false for this function.
legalized: false
|
|
regBankSelected: false
|
|
selected: false
|
|
tracksRegLiveness: true
|
|
# %vgpr0 is live on entry and is associated with virtual register %1,
# which the body copies it into.
liveins:
|
|
- { reg: '%vgpr0', virtual-reg: '%1' }
|
|
# Frame description: every field is zero or false, i.e. no stack usage and
# no calls are declared for this function.
frameInfo:
|
|
isFrameAddressTaken: false
|
|
isReturnAddressTaken: false
|
|
hasStackMap: false
|
|
hasPatchPoint: false
|
|
stackSize: 0
|
|
offsetAdjustment: 0
|
|
maxAlignment: 0
|
|
adjustsStack: false
|
|
hasCalls: false
|
|
maxCallFrameSize: 0
|
|
hasOpaqueSPAdjustment: false
|
|
hasVAStart: false
|
|
hasMustTailInVarArgFunc: false
|
|
body: |
|
|
; Single basic block fed to si-load-store-opt. Per the CHECK lines at the top
; of this file, the pass output must contain, in this order: a DS_READ2_B32 at
; offsets 0/1, the DS_WRITE_B32 at offset 64, the DS_READ2_B32 at offsets
; 16/17, and the final DS_WRITE_B32 at offset 0.
; NOTE(review): the CHECK lines name %0 where this input uses %1; presumably
; virtual registers are renumbered when the MIR is parsed -- confirm.
bb.0:
|
|
liveins: %vgpr0
|
|
|
|
; %1 is the first operand of every DS instruction below; with offset 0 the
; attached memory operand is %ir.ptr.0.
%1:vgpr_32 = COPY %vgpr0
|
|
; Every DS instruction below lists %m0 as an implicit use.
%m0 = S_MOV_B32 -1
|
|
; Load from %ir.ptr.0 (offset 0). Its result %2 feeds the store that follows.
%2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.ptr.0)
|
|
; Store the loaded value to %ir.ptr.64 (offset 64); last use of %2.
DS_WRITE_B32 %1, killed %2, 64, 0, implicit %m0, implicit %exec :: (store 4 into %ir.ptr.64)
|
|
|
|
; Make this load unmergeable, to tempt SILoadStoreOptimizer into merging the
|
|
; other two loads.
|
|
; This read covers %ir.ptr.64, i.e. the location written by the DS_WRITE_B32
; above, so it must not end up ahead of that store.
%6:vreg_64 = DS_READ2_B32 %1, 16, 17, 0, implicit %m0, implicit %exec :: (load 8 from %ir.ptr.64, align 4)
|
|
; Only sub0 of the loaded pair is used.
%3:vgpr_32 = COPY %6.sub0
|
|
; Load from %ir.ptr.4 (offset 4): the second of the "other two loads"
; mentioned above, the first being the load at offset 0.
%4:vgpr_32 = DS_READ_B32 %1, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.ptr.4)
|
|
%5:vgpr_32 = V_ADD_I32_e32 killed %3, killed %4, implicit-def %vcc, implicit %exec
|
|
; Store the sum back to %ir.ptr.0; %1 and %m0 are marked killed here.
DS_WRITE_B32 killed %1, %5, 0, 0, implicit killed %m0, implicit %exec :: (store 4 into %ir.ptr.0)
|
|
S_ENDPGM
|
|
|
|
...
|