1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[AMDGPU] Specify a triple to avoid codegen changes depending on host OS

This commit is contained in:
Jay Foad 2020-11-03 13:31:59 +00:00
parent d761b7c23c
commit 2d99253720
3 changed files with 107 additions and 176 deletions

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@ -10,7 +10,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; CI-LABEL: write_ds_sub0_offset0_global:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, lds.obj@abs32@lo, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v1 offset:12
@ -19,7 +19,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; GFX9-LABEL: write_ds_sub0_offset0_global:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, lds.obj@abs32@lo, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: ds_write_b32 v0, v1 offset:12
; GFX9-NEXT: s_endpgm
@ -37,7 +37,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dword s0, s[0:1], 0x9
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, lds.obj@abs32@lo, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: s_waitcnt lgkmcnt(0)
@ -57,7 +57,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, lds.obj@abs32@lo, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.
@ -13,9 +13,8 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v1, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v1 offset1:8
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
@ -28,8 +27,7 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
; GFX9-LABEL: simple_read2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v2
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
@ -51,9 +49,8 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out)
; CI-LABEL: simple_read2_f32_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v1, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v1 offset1:255
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
@ -66,8 +63,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out)
; GFX9-LABEL: simple_read2_f32_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v2
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:255
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
@ -89,15 +85,14 @@ define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0
; CI-LABEL: simple_read2_f32_too_far:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v1, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v2, v1
; CI-NEXT: ds_read_b32 v1, v1 offset:1028
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:1028
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@ -105,12 +100,11 @@ define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0
; GFX9-LABEL: simple_read2_f32_too_far:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_add_u32_e32 v1, lds@abs32@lo, v0
; GFX9-NEXT: ds_read_b32 v2, v1
; GFX9-NEXT: ds_read_b32 v1, v1 offset:1028
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -129,18 +123,16 @@ define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v3, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:8
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v4, v1, v2
; CI-NEXT: ds_read2_b32 v[1:2], v3 offset0:11 offset1:27
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v4, v1
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@ -148,9 +140,8 @@ define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; GFX9-LABEL: simple_read2_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: v_add_u32_e32 v2, lds@abs32@lo, v4
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:11 offset1:27
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
@ -188,19 +179,18 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out)
; CI-LABEL: simple_read2_f32_x2_barrier:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v3, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:8
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_add_f32_e32 v4, v1, v2
; CI-NEXT: ds_read2_b32 v[1:2], v3 offset0:11 offset1:27
; CI-NEXT: v_add_f32_e32 v3, v1, v2
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:11 offset1:27
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v4, v1
; CI-NEXT: v_add_f32_e32 v2, v3, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@ -208,11 +198,10 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out)
; GFX9-LABEL: simple_read2_f32_x2_barrier:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: v_add_u32_e32 v2, lds@abs32@lo, v4
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:11 offset1:27
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -253,18 +242,16 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)*
; CI-LABEL: simple_read2_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v3, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v3 offset0:2 offset1:8
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v4, v1, v2
; CI-NEXT: ds_read2_b32 v[1:2], v3 offset0:11 offset1:27
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v4, v1
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_endpgm
@ -272,9 +259,8 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)*
; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: v_add_u32_e32 v2, lds@abs32@lo, v4
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:11 offset1:27
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
@ -422,9 +408,8 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0
; CI-LABEL: read2_ptr_is_subreg_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v1, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v1 offset1:8
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
@ -437,8 +422,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0
; GFX9-LABEL: read2_ptr_is_subreg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v2
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
@ -466,15 +450,14 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out)
; CI-LABEL: simple_read2_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v1, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v2, v1
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@ -482,12 +465,11 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out)
; GFX9-LABEL: simple_read2_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_add_u32_e32 v1, lds@abs32@lo, v0
; GFX9-NEXT: ds_read_b32 v2, v1
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -506,15 +488,14 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out)
; CI-LABEL: simple_read2_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v1, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v2, v1
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@ -522,12 +503,11 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out)
; GFX9-LABEL: simple_read2_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_add_u32_e32 v1, lds@abs32@lo, v0
; GFX9-NEXT: ds_read_b32 v2, v1
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -809,9 +789,8 @@ define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
; CI-LABEL: simple_read2_f64:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, lds.f64@abs32@lo, v4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:8
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
@ -824,8 +803,7 @@ define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
; GFX9-LABEL: simple_read2_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_add_u32_e32 v0, lds.f64@abs32@lo, v4
; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:8
; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
@ -847,9 +825,8 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out
; CI-LABEL: simple_read2_f64_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, lds.f64@abs32@lo, v4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:255
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
@ -862,8 +839,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out
; GFX9-LABEL: simple_read2_f64_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_add_u32_e32 v0, lds.f64@abs32@lo, v4
; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:255
; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
@ -885,10 +861,9 @@ define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #
; CI-LABEL: simple_read2_f64_too_far:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_add_i32_e32 v3, vcc, lds.f64@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b64 v[1:2], v3
; CI-NEXT: ds_read_b64 v[3:4], v3 offset:2056
; CI-NEXT: ds_read_b64 v[1:2], v0
; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
@ -901,9 +876,8 @@ define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #
; GFX9-LABEL: simple_read2_f64_too_far:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_add_u32_e32 v2, lds.f64@abs32@lo, v4
; GFX9-NEXT: ds_read_b64 v[0:1], v2
; GFX9-NEXT: ds_read_b64 v[2:3], v2 offset:2056
; GFX9-NEXT: ds_read_b64 v[0:1], v4
; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
@ -971,7 +945,7 @@ define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, doubl
define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
; CI-LABEL: load_constant_adjacent_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@ -984,7 +958,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out
;
; GFX9-LABEL: load_constant_adjacent_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1003,7 +977,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out
define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
; CI-LABEL: load_constant_disjoint_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@ -1016,7 +990,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
;
; GFX9-LABEL: load_constant_disjoint_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:2
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -1037,7 +1011,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
; CI-LABEL: load_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v2, bar@abs32@lo
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; CI-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
@ -1052,7 +1026,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
;
; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, bar@abs32@lo
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX9-ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@ -1066,7 +1040,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
;
; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, bar@abs32@lo
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
@ -1088,11 +1062,8 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
; CI-LABEL: load_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b32 s4, bar.large@abs32@lo
; CI-NEXT: s_add_i32 s5, s4, 0x4000
; CI-NEXT: s_addk_i32 s4, 0x7ff8
; CI-NEXT: v_mov_b32_e32 v0, s5
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, 0x4000
; CI-NEXT: v_mov_b32_e32 v2, 0x7ff8
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
@ -1107,11 +1078,8 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspac
;
; GFX9-LABEL: load_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, bar.large@abs32@lo
; GFX9-NEXT: s_add_i32 s3, s2, 0x4000
; GFX9-NEXT: s_addk_i32 s2, 0x7ff8
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7ff8
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@ -1399,26 +1367,25 @@ bb:
define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) {
; CI-LABEL: ds_read_call_read:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; CI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_getpc_b64 s[40:41]
; CI-NEXT: s_mov_b32 s40, s0
; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0
; CI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9
; CI-NEXT: s_load_dword s0, s[0:1], 0xb
; CI-NEXT: s_mov_b32 s42, -1
; CI-NEXT: s_mov_b32 s43, 0xe8f000
; CI-NEXT: s_add_u32 s40, s40, s3
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_addc_u32 s41, s41, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s40, s40, s3
; CI-NEXT: s_addc_u32 s41, s41, 0
; CI-NEXT: v_add_i32_e32 v40, vcc, s0, v0
; CI-NEXT: s_getpc_b64 s[0:1]
; CI-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4
; CI-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; CI-NEXT: ds_read_b32 v41, v40
; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
; CI-NEXT: s_mov_b64 s[2:3], s[42:43]
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b32 s39, 0xf000
; CI-NEXT: s_mov_b32 s38, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
@ -1431,24 +1398,23 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
;
; GFX9-LABEL: ds_read_call_read:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_getpc_b64 s[36:37]
; GFX9-NEXT: s_mov_b32 s36, s0
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s36, s36, s3
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v40, v0, 2, s2
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: ds_read_b32 v41, v40
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: ds_read_b32 v0, v40 offset:4

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
@ -16,7 +16,6 @@ define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, flo
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
@ -28,7 +27,6 @@ define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, flo
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; GFX9-NEXT: s_endpgm
@ -54,7 +52,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
@ -67,7 +64,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
@ -98,7 +94,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
@ -113,7 +108,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -146,7 +140,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
@ -161,7 +154,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -197,7 +189,6 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
@ -207,13 +198,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, lds@abs32@lo
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -242,7 +232,6 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)*
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
@ -252,8 +241,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)*
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, lds@abs32@lo
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -283,7 +271,6 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)*
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
@ -293,11 +280,10 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)*
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, lds@abs32@lo
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -324,7 +310,6 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
@ -337,7 +322,6 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT: s_endpgm
@ -368,7 +352,6 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)*
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
@ -383,7 +366,6 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)*
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -416,7 +398,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C,
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
@ -430,7 +411,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C,
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@ -474,7 +454,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrs
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, lds@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
@ -488,7 +467,6 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrs
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v0, lds@abs32@lo, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@ -588,7 +566,6 @@ define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, do
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, lds.f64@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
@ -600,7 +577,6 @@ define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, do
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v2, lds.f64@abs32@lo, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
; GFX9-NEXT: s_endpgm
@ -754,7 +730,6 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: v_add_i32_e32 v0, vcc, lds.f64@abs32@lo, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
@ -767,7 +742,6 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8
; GFX9-NEXT: v_add_u32_e32 v4, lds.f64@abs32@lo, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
@ -790,7 +764,7 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() {
; CI-LABEL: store_constant_adjacent_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_movk_i32 s0, 0x7b
; CI-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_mov_b32 m0, -1
@ -800,7 +774,7 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() {
; GFX9-LABEL: store_constant_adjacent_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
@ -813,17 +787,17 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() {
define amdgpu_kernel void @store_constant_disjoint_offsets() {
; CI-LABEL: store_constant_disjoint_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:2
; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_disjoint_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, foo@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:2
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; GFX9-NEXT: s_endpgm
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@ -835,21 +809,19 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, bar@abs32@lo
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; CI-NEXT: ds_write2_b32 v0, v1, v0 offset1:1
; CI-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, bar@abs32@lo
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
@ -858,8 +830,7 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, bar@abs32@lo
; GFX9-UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-UNALIGNED-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@ -871,28 +842,22 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b32 s0, bar.large@abs32@lo
; CI-NEXT: s_add_i32 s1, s0, 0x4000
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: v_mov_b32_e32 v0, 0x4000
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_addk_i32 s0, 0x7ff8
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v0, 0x7ff8
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s0, bar.large@abs32@lo
; GFX9-NEXT: s_add_i32 s1, s0, 0x4000
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_addk_i32 s0, 0x7ff8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4