# llvm-mirror/test/CodeGen/AMDGPU/merge-load-store.mir

# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s

# Check that SILoadStoreOptimizer honors memory dependencies between moved
# instructions.
#
# The following IR snippet would usually be optimized by the peephole optimizer.
# However, an equivalent situation can occur with buffer instructions as well.

# CHECK-LABEL: name: mem_dependency
# CHECK: DS_READ2_B32 %0, 0, 1,
# CHECK: DS_WRITE_B32 %0, killed %1, 64,
# CHECK: DS_READ2_B32 %0, 16, 17,
# CHECK: DS_WRITE_B32 killed %0, %5, 0

--- |
  ; Source IR for the mem_dependency machine function; its values are what the
  ; %ir.ptr.* memory operands in the MIR body refer to.
  ; With 4-byte i32 elements, %ptr.4 is %ptr.0 + 4 bytes and %ptr.64 is
  ; %ptr.0 + 64 bytes.
  define amdgpu_kernel void @mem_dependency(i32 addrspace(3)* %ptr.0) nounwind {
    %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
    %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
    ; The value loaded from %ptr.0 is stored to %ptr.64 ...
    %1 = load i32, i32 addrspace(3)* %ptr.0
    store i32 %1, i32 addrspace(3)* %ptr.64
    ; ... and %ptr.64 is read back right after that store, so this load depends
    ; on the store above.
    %2 = load i32, i32 addrspace(3)* %ptr.64
    ; Second load next to %ptr.0 (offset 4), placed after the store.
    %3 = load i32, i32 addrspace(3)* %ptr.4
    %4 = add i32 %2, %3
    store i32 %4, i32 addrspace(3)* %ptr.0
    ret void
  }

  @lds0 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
  @lds1 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
  @lds2 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
  @lds3 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4

  ; Source IR for the asm_defines_address machine function. The result of an
  ; inline asm statement is turned into an index into @lds1, so the address of
  ; a later load is derived from the asm output.
  define void @asm_defines_address() #0 {
  bb:
    ; Low byte of element 0 of @lds0; combined into the final result below.
    %tmp1 = load i32, i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0), align 4
    %0 = and i32 %tmp1, 255
    ; Loads from @lds1 and @lds3 at undef indices.
    %tmp3 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef), align 4
    %tmp6 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef), align 4
    ; Inline asm consuming %tmp6; its output %tmp7 feeds the address below.
    %tmp7 = tail call i32 asm "v_or_b32 $0, 0, $1", "=v,v"(i32 %tmp6) #1
    ; Bits 16..23 of the asm result select the @lds1 element to load.
    %tmp10 = lshr i32 %tmp7, 16
    %tmp11 = and i32 %tmp10, 255
    %tmp12 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp11
    %tmp13 = load i32, i32 addrspace(3)* %tmp12, align 4
    ; The loaded value is mixed with %tmp3 and reduced to another 8-bit index
    ; into @lds1, giving a second load whose address depends on the first.
    %tmp14 = xor i32 %tmp3, %tmp13
    %tmp15 = lshr i32 %tmp14, 8
    %tmp16 = and i32 %tmp15, 16711680
    %tmp19 = lshr i32 %tmp16, 16
    %tmp20 = and i32 %tmp19, 255
    %tmp21 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp20
    %tmp22 = load i32, i32 addrspace(3)* %tmp21, align 4
    ; Load from @lds2 at an undef index, xor-ed with %tmp22.
    %tmp24 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef), align 4
    %tmp25 = xor i32 %tmp22, %tmp24
    ; Keep only the top byte (-16777216 == 0xFF000000) and merge it with the
    ; low byte computed at the top of the block.
    %tmp26 = and i32 %tmp25, -16777216
    %tmp28 = or i32 %0, %tmp26
    ; Volatile store of the result to an undef addrspace(1) pointer.
    store volatile i32 %tmp28, i32 addrspace(1)* undef
    ret void
  }

  attributes #0 = { convergent nounwind }
  attributes #1 = { convergent nounwind readnone }

...
---
# Machine function for the memory-dependency case: two mergeable DS loads at
# offsets 0 and 4 with a DS store to offset 64 and a read-back of that
# location in between.
name:            mem_dependency
alignment:       0
exposesReturnsTwice: false
legalized:       false
regBankSelected: false
selected:        false
tracksRegLiveness: true
# The only live-in is $vgpr0, which the body copies into virtual register %1.
liveins:
  - { reg: '$vgpr0', virtual-reg: '%1' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap:     false
  hasPatchPoint:   false
  stackSize:       0
  offsetAdjustment: 0
  maxAlignment:    0
  adjustsStack:    false
  hasCalls:        false
  maxCallFrameSize: 0
  hasOpaqueSPAdjustment: false
  hasVAStart:      false
  hasMustTailInVarArgFunc: false
body:             |
  bb.0:
    liveins: $vgpr0

    ; %1 is the base address operand of every DS instruction in this block.
    %1:vgpr_32 = COPY $vgpr0
    $m0 = S_MOV_B32 -1
    ; Load from offset 0; the loaded value is immediately stored to offset 64.
    %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)
    DS_WRITE_B32 %1, killed %2, 64, 0, implicit $m0, implicit $exec :: (store 4 into %ir.ptr.64)

    ; Make this load unmergeable, to tempt SILoadStoreOptimizer into merging the
    ; other two loads.
    ; Its memory operand is %ir.ptr.64, the location written by the store above,
    ; so it has to stay after that store.
    %6:vreg_64 = DS_READ2_B32 %1, 16, 17, 0, implicit $m0, implicit $exec :: (load 8 from %ir.ptr.64, align 4)
    %3:vgpr_32 = COPY %6.sub0
    ; Load from offset 4, the merge partner of the offset-0 load. The FileCheck
    ; lines at the top of the file expect the merged DS_READ2_B32 (0, 1) before
    ; the offset-64 store and the DS_READ2_B32 (16, 17) after it.
    %4:vgpr_32 = DS_READ_B32 %1, 4, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.4)
    %5:vgpr_32 = V_ADD_I32_e32 killed %3, killed %4, implicit-def $vcc, implicit $exec
    DS_WRITE_B32 killed %1, %5, 0, 0, implicit killed $m0, implicit $exec :: (store 4 into %ir.ptr.0)
    S_ENDPGM

...
---
# Make sure the asm def isn't moved after the point where it's used for
# the address.
# CHECK-LABEL: name: asm_defines_address
# CHECK: DS_READ2ST64_B32
# CHECK: DS_READ2ST64_B32
# CHECK: INLINEASM
# CHECK: DS_READ_B32
# CHECK: DS_READ_B32
# Machine function for the inline-asm case. %0 is declared in the registers
# list and its only definition in the body is the INLINEASM output operand.
name:            asm_defines_address
tracksRegLiveness: true
registers:
  - { id: 0, class: vgpr_32, preferred-register: '' }
body:             |
  bb.0:
    ; Constant-zero base address used by the loads at offsets 3072, 2048, 1024
    ; and (further down) 0.
    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %2:vgpr_32 = DS_READ_B32 %1, 3072, 0, implicit $m0, implicit $exec :: (dereferenceable load 4 from `i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0)`, addrspace 3)
    %3:vgpr_32 = DS_READ_B32 %1, 2048, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef)`, addrspace 3)
    %4:vgpr_32 = DS_READ_B32 %1, 1024, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef)`, addrspace 3)
    ; The asm reads %4 and defines %0 through an asm operand (def %0) rather
    ; than through a leading explicit def of the instruction.
    INLINEASM &"v_or_b32 $0, 0, $1", 32, 327690, def %0, 327689, %4
    ; %0 is the address of the first load below and its result %5 is the
    ; address of the second, so both must stay after the INLINEASM.
    %5:vgpr_32 = DS_READ_B32 %0, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp12, addrspace 3)
    %6:vgpr_32 = DS_READ_B32 %5, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp21, addrspace 3)
    ; Fourth load based on %1. The FileCheck lines expect two DS_READ2ST64_B32
    ; ahead of the INLINEASM, presumably the four %1-based loads paired up.
    %7:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef)`, addrspace 3)
    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7

...
AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer Summary: This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer. However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4"). Fixes at least: KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs KHR-GL45.cull_distance.functional piglit tes-input-gl_ClipDistance.shader_test ... and probably more Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40303 llvm-svn: 318829 2017-11-22 13:25:21 +01:00			`# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s \| FileCheck %s`

			`# Check that SILoadStoreOptimizer honors memory dependencies between moved`
			`# instructions.`
			`#`
			`# The following IR snippet would usually be optimized by the peephole optimizer.`
			`# However, an equivalent situation can occur with buffer instructions as well.`

			`# CHECK-LABEL: name: mem_dependency`
			`# CHECK: DS_READ2_B32 %0, 0, 1,`
			`# CHECK: DS_WRITE_B32 %0, killed %1, 64,`
			`# CHECK: DS_READ2_B32 %0, 16, 17,`
			`# CHECK: DS_WRITE_B32 killed %0, %5, 0`

			`--- \|`
			`define amdgpu_kernel void @mem_dependency(i32 addrspace(3)* %ptr.0) nounwind {`
			`%ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1`
			`%ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16`
			`%1 = load i32, i32 addrspace(3)* %ptr.0`
			`store i32 %1, i32 addrspace(3)* %ptr.64`
			`%2 = load i32, i32 addrspace(3)* %ptr.64`
			`%3 = load i32, i32 addrspace(3)* %ptr.4`
			`%4 = add i32 %2, %3`
			`store i32 %4, i32 addrspace(3)* %ptr.0`
			`ret void`
			`}`
AMDGPU: Fix incorrect reordering when inline asm defines LDS address Defs of operands outside of the instruction's explicit defs need to be checked. llvm-svn: 324554 2018-02-08 02:56:14 +01:00
			`@lds0 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4`
			`@lds1 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4`
			`@lds2 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4`
			`@lds3 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4`

			`define void @asm_defines_address() #0 {`
			`bb:`
			`%tmp1 = load i32, i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0), align 4`
			`%0 = and i32 %tmp1, 255`
			`%tmp3 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef), align 4`
			`%tmp6 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef), align 4`
			`%tmp7 = tail call i32 asm "v_or_b32 $0, 0, $1", "=v,v"(i32 %tmp6) #1`
			`%tmp10 = lshr i32 %tmp7, 16`
			`%tmp11 = and i32 %tmp10, 255`
			`%tmp12 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp11`
			`%tmp13 = load i32, i32 addrspace(3)* %tmp12, align 4`
			`%tmp14 = xor i32 %tmp3, %tmp13`
			`%tmp15 = lshr i32 %tmp14, 8`
			`%tmp16 = and i32 %tmp15, 16711680`
			`%tmp19 = lshr i32 %tmp16, 16`
			`%tmp20 = and i32 %tmp19, 255`
			`%tmp21 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp20`
			`%tmp22 = load i32, i32 addrspace(3)* %tmp21, align 4`
			`%tmp24 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef), align 4`
			`%tmp25 = xor i32 %tmp22, %tmp24`
			`%tmp26 = and i32 %tmp25, -16777216`
			`%tmp28 = or i32 %0, %tmp26`
			`store volatile i32 %tmp28, i32 addrspace(1)* undef`
			`ret void`
			`}`

			`attributes #0 = { convergent nounwind }`
			`attributes #1 = { convergent nounwind readnone }`

AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer Summary: This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer. However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4"). Fixes at least: KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs KHR-GL45.cull_distance.functional piglit tes-input-gl_ClipDistance.shader_test ... and probably more Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40303 llvm-svn: 318829 2017-11-22 13:25:21 +01:00			`...`
			`---`
			`name: mem_dependency`
			`alignment: 0`
			`exposesReturnsTwice: false`
			`legalized: false`
			`regBankSelected: false`
			`selected: false`
			`tracksRegLiveness: true`
			`liveins:`
Followup on Proposal to move MIR physical register namespace to '$' sigil. Discussed here: http://lists.llvm.org/pipermail/llvm-dev/2018-January/120320.html In preparation for adding support for named vregs we are changing the sigil for physical registers in MIR to '$' from '%'. This will prevent name clashes of named physical register with named vregs. llvm-svn: 323922 2018-01-31 23:04:26 +01:00			`- { reg: '$vgpr0', virtual-reg: '%1' }`
AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer Summary: This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer. However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4"). Fixes at least: KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs KHR-GL45.cull_distance.functional piglit tes-input-gl_ClipDistance.shader_test ... and probably more Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40303 llvm-svn: 318829 2017-11-22 13:25:21 +01:00			`frameInfo:`
			`isFrameAddressTaken: false`
			`isReturnAddressTaken: false`
			`hasStackMap: false`
			`hasPatchPoint: false`
			`stackSize: 0`
			`offsetAdjustment: 0`
			`maxAlignment: 0`
			`adjustsStack: false`
			`hasCalls: false`
			`maxCallFrameSize: 0`
			`hasOpaqueSPAdjustment: false`
			`hasVAStart: false`
			`hasMustTailInVarArgFunc: false`
			`body: \|`
			`bb.0:`
Followup on Proposal to move MIR physical register namespace to '$' sigil. Discussed here: http://lists.llvm.org/pipermail/llvm-dev/2018-January/120320.html In preparation for adding support for named vregs we are changing the sigil for physical registers in MIR to '$' from '%'. This will prevent name clashes of named physical register with named vregs. llvm-svn: 323922 2018-01-31 23:04:26 +01:00			`liveins: $vgpr0`
AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer Summary: This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer. However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4"). Fixes at least: KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs KHR-GL45.cull_distance.functional piglit tes-input-gl_ClipDistance.shader_test ... and probably more Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40303 llvm-svn: 318829 2017-11-22 13:25:21 +01:00
Followup on Proposal to move MIR physical register namespace to '$' sigil. Discussed here: http://lists.llvm.org/pipermail/llvm-dev/2018-January/120320.html In preparation for adding support for named vregs we are changing the sigil for physical registers in MIR to '$' from '%'. This will prevent name clashes of named physical register with named vregs. llvm-svn: 323922 2018-01-31 23:04:26 +01:00			`%1:vgpr_32 = COPY $vgpr0`
			`$m0 = S_MOV_B32 -1`
			`%2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0)`
			`DS_WRITE_B32 %1, killed %2, 64, 0, implicit $m0, implicit $exec :: (store 4 into %ir.ptr.64)`
AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer Summary: This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer. However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4"). Fixes at least: KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs KHR-GL45.cull_distance.functional piglit tes-input-gl_ClipDistance.shader_test ... and probably more Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40303 llvm-svn: 318829 2017-11-22 13:25:21 +01:00
			`; Make this load unmergeable, to tempt SILoadStoreOptimizer into merging the`
			`; other two loads.`
Followup on Proposal to move MIR physical register namespace to '$' sigil. Discussed here: http://lists.llvm.org/pipermail/llvm-dev/2018-January/120320.html In preparation for adding support for named vregs we are changing the sigil for physical registers in MIR to '$' from '%'. This will prevent name clashes of named physical register with named vregs. llvm-svn: 323922 2018-01-31 23:04:26 +01:00			`%6:vreg_64 = DS_READ2_B32 %1, 16, 17, 0, implicit $m0, implicit $exec :: (load 8 from %ir.ptr.64, align 4)`
AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer Summary: This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer. However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4"). Fixes at least: KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs KHR-GL45.cull_distance.functional piglit tes-input-gl_ClipDistance.shader_test ... and probably more Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40303 llvm-svn: 318829 2017-11-22 13:25:21 +01:00			`%3:vgpr_32 = COPY %6.sub0`
Followup on Proposal to move MIR physical register namespace to '$' sigil. Discussed here: http://lists.llvm.org/pipermail/llvm-dev/2018-January/120320.html In preparation for adding support for named vregs we are changing the sigil for physical registers in MIR to '$' from '%'. This will prevent name clashes of named physical register with named vregs. llvm-svn: 323922 2018-01-31 23:04:26 +01:00			`%4:vgpr_32 = DS_READ_B32 %1, 4, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.4)`
			`%5:vgpr_32 = V_ADD_I32_e32 killed %3, killed %4, implicit-def $vcc, implicit $exec`
			`DS_WRITE_B32 killed %1, %5, 0, 0, implicit killed $m0, implicit $exec :: (store 4 into %ir.ptr.0)`
AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer Summary: This bug seems to have gone unnoticed because critical cases with LDS instructions are eliminated by the peephole optimizer. However, equivalent situations arise with buffer loads and stores as well, so this fixes regressions since r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4"). Fixes at least: KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs KHR-GL45.cull_distance.functional piglit tes-input-gl_ClipDistance.shader_test ... and probably more Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40303 llvm-svn: 318829 2017-11-22 13:25:21 +01:00			`S_ENDPGM`

			`...`
AMDGPU: Fix incorrect reordering when inline asm defines LDS address Defs of operands outside of the instruction's explicit defs need to be checked. llvm-svn: 324554 2018-02-08 02:56:14 +01:00			`---`
			`# Make sure the asm def isn't moved after the point where it's used for`
			`# the address.`
			`# CHECK-LABEL: name: asm_defines_address`
			`# CHECK: DS_READ2ST64_B32`
			`# CHECK: DS_READ2ST64_B32`
			`# CHECK: INLINEASM`
			`# CHECK: DS_READ_B32`
			`# CHECK: DS_READ_B32`
			`name: asm_defines_address`
			`tracksRegLiveness: true`
			`registers:`
			`- { id: 0, class: vgpr_32, preferred-register: '' }`
			`body: \|`
			`bb.0:`
			`%1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec`
			%2:vgpr_32 = DS_READ_B32 %1, 3072, 0, implicit $m0, implicit $exec :: (dereferenceable load 4 from `i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0)`, addrspace 3)
			%3:vgpr_32 = DS_READ_B32 %1, 2048, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef)`, addrspace 3)
			%4:vgpr_32 = DS_READ_B32 %1, 1024, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef)`, addrspace 3)
			`INLINEASM &"v_or_b32 $0, 0, $1", 32, 327690, def %0, 327689, %4`
			`%5:vgpr_32 = DS_READ_B32 %0, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp12, addrspace 3)`
			`%6:vgpr_32 = DS_READ_B32 %5, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp21, addrspace 3)`
			%7:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef)`, addrspace 3)
			`S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7`

			`...`