1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00

[AMDGPU] Set DS alignment requirements to be more strict

Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are
now the same as for other GCN subtargets. This way we can avoid any
unintentional use of these instructions on systems that do not support dword
alignment and instead require natural alignment.
This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default.

Differential Revision: https://reviews.llvm.org/D87821
This commit is contained in:
Mirko Brkusanin 2020-09-18 15:19:54 +02:00
parent 88960bf64d
commit 46431af84c
15 changed files with 788 additions and 99 deletions

View File

@ -1437,9 +1437,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// Check if alignment requirements for ds_read/write instructions are
// disabled.
if (Subtarget->hasUnalignedDSAccessEnabled()) {
if (Subtarget->hasUnalignedDSAccessEnabled() &&
!Subtarget->hasLDSMisalignedBug()) {
if (IsFast)
*IsFast = true;
*IsFast = Alignment != Align(2);
return true;
}
@ -1455,10 +1456,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
}
if (Size == 96) {
// ds_read/write_b96 require 16-byte alignment on gfx8 and older.
bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
!Subtarget->hasLDSMisalignedBug())
? 4
: 16);
bool Aligned = Alignment >= Align(16);
if (IsFast)
*IsFast = Aligned;
@ -1468,10 +1466,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
// can do an 8-byte aligned, 16-byte access in a single operation using
// ds_read2/write2_b64.
bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
!Subtarget->hasLDSMisalignedBug())
? 4
: 8);
bool Aligned = Alignment >= Align(8);
if (IsFast)
*IsFast = Aligned;

View File

@ -2,6 +2,7 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED %s
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
@ -21,12 +22,12 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v4:
; VECT-DAG: ds_read_b128
; VECT-DAG: ds_write_b128
; SPLIT-DAG: ds_read2_b32
; SPLIT-DAG: ds_read2_b32
; SPLIT-DAG: ds_write2_b32
; SPLIT-DAG: ds_write2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read_b128
; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -46,12 +47,12 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v3:
; VECT-DAG: ds_read_b96
; VECT-DAG: ds_write_b96
; SPLIT-DAG: ds_read2_b32
; SPLIT-DAG: ds_read_b32
; SPLIT-DAG: ds_write2_b32
; SPLIT-DAG: ds_write_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()

File diff suppressed because it is too large Load Diff

View File

@ -244,7 +244,9 @@ define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b128 v[0:3], v0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -202,7 +202,9 @@ define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b96 v[0:2], v0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@ -223,7 +225,9 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b96 v[0:2], v0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read_b64 v[0:1], v0
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -212,12 +212,13 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:

View File

@ -178,11 +178,12 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align4:
@ -208,11 +209,12 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align8:

View File

@ -480,7 +480,11 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
; CI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
; CI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
; GFX9: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]]
; GFX9-ALIGNED-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
; GFX9-ALIGNED-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
; GFX9-UNALIGNED: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]]
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4

View File

@ -441,7 +441,11 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
; GFX9-DAG: ds_write_b128 [[PTR]], {{v\[[0-9]+:[0-9]+\]}}
; GFX9-ALIGNED-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GFX9-ALIGNED-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
; GFX9-UNALIGNED: ds_write_b128 [[PTR]], {{v\[[0-9]+:[0-9]+\]}}
; GCN: s_endpgm
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
@ -514,7 +518,11 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
; GFX9: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
; GFX9-ALIGNED-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
; GFX9-ALIGNED-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
; GFX9-UNALIGNED: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in

View File

@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED,VECT %s
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
@ -21,12 +22,12 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v4:
; VECT-DAG: ds_read_b128
; VECT-DAG: ds_write_b128
; SPLIT-DAG: ds_read2_b32
; SPLIT-DAG: ds_read2_b32
; SPLIT-DAG: ds_write2_b32
; SPLIT-DAG: ds_write2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read_b128
; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -46,12 +47,12 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v3:
; VECT-DAG: ds_read_b96
; VECT-DAG: ds_write_b96
; SPLIT-DAG: ds_read2_b32
; SPLIT-DAG: ds_read_b32
; SPLIT-DAG: ds_write2_b32
; SPLIT-DAG: ds_write_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -288,7 +288,9 @@ define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b128 v[0:3], v0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -239,7 +239,9 @@ define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b96 v[0:2], v0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@ -272,7 +274,9 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b96 v[0:2], v0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -290,12 +290,13 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:

View File

@ -244,11 +244,12 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: ds_write_b32 v0, v3 offset:8
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align4:
@ -288,11 +289,12 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align8:

View File

@ -179,9 +179,8 @@ entry:
; CM: LDS_WRITE
; CM: LDS_WRITE
; SICIVI: ds_write2_b32
; SICIVI: ds_write2_b32
; GFX9: ds_write_b128
; GCN: ds_write2_b32
; GCN: ds_write2_b32
define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4