mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 02:52:53 +02:00
[AMDGPU] Set DS alignment requirements to be more strict
Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are now the same as for other GCN subtargets. This way we can avoid any unintentional use of these instructions on systems that do not support dword alignment and instead require natural alignment. This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default. Differential Revision: https://reviews.llvm.org/D87821
This commit is contained in:
parent
88960bf64d
commit
46431af84c
@ -1437,9 +1437,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
|
||||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
|
||||
// Check if alignment requirements for ds_read/write instructions are
|
||||
// disabled.
|
||||
if (Subtarget->hasUnalignedDSAccessEnabled()) {
|
||||
if (Subtarget->hasUnalignedDSAccessEnabled() &&
|
||||
!Subtarget->hasLDSMisalignedBug()) {
|
||||
if (IsFast)
|
||||
*IsFast = true;
|
||||
*IsFast = Alignment != Align(2);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1455,10 +1456,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
|
||||
}
|
||||
if (Size == 96) {
|
||||
// ds_read/write_b96 require 16-byte alignment on gfx8 and older.
|
||||
bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
|
||||
!Subtarget->hasLDSMisalignedBug())
|
||||
? 4
|
||||
: 16);
|
||||
bool Aligned = Alignment >= Align(16);
|
||||
if (IsFast)
|
||||
*IsFast = Aligned;
|
||||
|
||||
@ -1468,10 +1466,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
|
||||
// ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
|
||||
// can do an 8-byte aligned, 16-byte access in a single operation using
|
||||
// ds_read2/write2_b64.
|
||||
bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
|
||||
!Subtarget->hasLDSMisalignedBug())
|
||||
? 4
|
||||
: 8);
|
||||
bool Aligned = Alignment >= Align(8);
|
||||
if (IsFast)
|
||||
*IsFast = Aligned;
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED %s
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v2:
|
||||
; GCN-DAG: ds_read2_b32
|
||||
@ -21,12 +22,12 @@ bb:
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v4:
|
||||
; VECT-DAG: ds_read_b128
|
||||
; VECT-DAG: ds_write_b128
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; GCN-DAG: ds_read2_b32
|
||||
; GCN-DAG: ds_read2_b32
|
||||
; GCN-DAG: ds_write2_b32
|
||||
; GCN-DAG: ds_write2_b32
|
||||
; UNALIGNED-DAG: ds_read_b128
|
||||
; UNALIGNED-DAG: ds_write_b128
|
||||
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -46,12 +47,12 @@ bb:
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v3:
|
||||
; VECT-DAG: ds_read_b96
|
||||
; VECT-DAG: ds_write_b96
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write_b32
|
||||
; GCN-DAG: ds_read2_b32
|
||||
; GCN-DAG: ds_read_b32
|
||||
; GCN-DAG: ds_write2_b32
|
||||
; GCN-DAG: ds_write_b32
|
||||
; UNALIGNED-DAG: ds_read_b96
|
||||
; UNALIGNED-DAG: ds_write_b96
|
||||
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -244,7 +244,9 @@ define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align4:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
||||
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -202,7 +202,9 @@ define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align4:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
||||
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -223,7 +225,9 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: ds_read_b64 v[0:1], v0
|
||||
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -212,12 +212,13 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v4i32_align4:
|
||||
|
@ -178,11 +178,12 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
|
||||
; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align4:
|
||||
@ -208,11 +209,12 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align8:
|
||||
|
@ -480,7 +480,11 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
|
||||
; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
|
||||
; CI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
|
||||
; CI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
|
||||
; GFX9: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]]
|
||||
|
||||
; GFX9-ALIGNED-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
|
||||
; GFX9-ALIGNED-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
|
||||
|
||||
; GFX9-UNALIGNED: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]]
|
||||
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
|
||||
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
|
||||
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
|
||||
|
@ -441,7 +441,11 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
|
||||
; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
|
||||
; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
|
||||
; GFX9-DAG: ds_write_b128 [[PTR]], {{v\[[0-9]+:[0-9]+\]}}
|
||||
|
||||
; GFX9-ALIGNED-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
|
||||
; GFX9-ALIGNED-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
|
||||
|
||||
; GFX9-UNALIGNED: ds_write_b128 [[PTR]], {{v\[[0-9]+:[0-9]+\]}}
|
||||
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
|
||||
@ -514,7 +518,11 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
|
||||
|
||||
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
|
||||
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
; GFX9: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
|
||||
|
||||
; GFX9-ALIGNED-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
|
||||
; GFX9-ALIGNED-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
|
||||
; GFX9-UNALIGNED: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
|
||||
|
@ -2,6 +2,7 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED,VECT %s
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v2:
|
||||
; GCN-DAG: ds_read2_b32
|
||||
@ -21,12 +22,12 @@ bb:
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v4:
|
||||
; VECT-DAG: ds_read_b128
|
||||
; VECT-DAG: ds_write_b128
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; GCN-DAG: ds_read2_b32
|
||||
; GCN-DAG: ds_read2_b32
|
||||
; GCN-DAG: ds_write2_b32
|
||||
; GCN-DAG: ds_write2_b32
|
||||
; UNALIGNED-DAG: ds_read_b128
|
||||
; UNALIGNED-DAG: ds_write_b128
|
||||
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -46,12 +47,12 @@ bb:
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v3:
|
||||
; VECT-DAG: ds_read_b96
|
||||
; VECT-DAG: ds_write_b96
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write_b32
|
||||
; GCN-DAG: ds_read2_b32
|
||||
; GCN-DAG: ds_read_b32
|
||||
; GCN-DAG: ds_write2_b32
|
||||
; GCN-DAG: ds_write_b32
|
||||
; UNALIGNED-DAG: ds_read_b96
|
||||
; UNALIGNED-DAG: ds_write_b96
|
||||
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -288,7 +288,9 @@ define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align4:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
||||
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -239,7 +239,9 @@ define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align4:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
||||
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -272,7 +274,9 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
||||
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -290,12 +290,13 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v4i32_align4:
|
||||
|
@ -244,11 +244,12 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
|
||||
; GFX9-NEXT: ds_write_b32 v0, v3 offset:8
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align4:
|
||||
@ -288,11 +289,12 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
|
||||
; GFX9-NEXT: ds_write_b32 v2, v3 offset:8
|
||||
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v3i32_align8:
|
||||
|
@ -179,9 +179,8 @@ entry:
|
||||
; CM: LDS_WRITE
|
||||
; CM: LDS_WRITE
|
||||
|
||||
; SICIVI: ds_write2_b32
|
||||
; SICIVI: ds_write2_b32
|
||||
; GFX9: ds_write_b128
|
||||
; GCN: ds_write2_b32
|
||||
; GCN: ds_write2_b32
|
||||
define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
|
||||
entry:
|
||||
store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
|
||||
|
Loading…
Reference in New Issue
Block a user